TensorBlock.h
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // This Source Code Form is subject to the terms of the Mozilla
5 // Public License v. 2.0. If a copy of the MPL was not distributed
6 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 
8 #ifndef EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H
9 #define EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H
10 
11 // IWYU pragma: private
12 #include "./InternalHeaderCheck.h"
13 
14 namespace Eigen {
15 namespace internal {
16 
17 // -------------------------------------------------------------------------- //
18 // Forward declarations for templates defined below.
19 template <typename Scalar, typename IndexType, int NumDims, int Layout>
20 class TensorBlockIO;
21 
22 // -------------------------------------------------------------------------- //
23 // Helper function to compute strides for a densely stored buffer of the given
24 // dimensions.
25 
26 // TODO(ezhulenev): We compute strides 1000 times in different evaluators, use
27 // this function instead everywhere.
28 template <int Layout, typename IndexType, int NumDims>
29 EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(const DSizes<IndexType, NumDims>& dimensions) {
30   DSizes<IndexType, NumDims> strides;
31   if (NumDims == 0) return strides;
32 
33  // TODO(ezhulenev): Use templates to unroll this loop (similar to
34  // h_array_reduce in CXX11meta.h)? Benchmark it.
35  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
36  strides[0] = 1;
37  for (int i = 1; i < NumDims; ++i) {
38  strides[i] = strides[i - 1] * dimensions[i - 1];
39  }
40  } else {
41  strides[NumDims - 1] = 1;
42  for (int i = NumDims - 2; i >= 0; --i) {
43  strides[i] = strides[i + 1] * dimensions[i + 1];
44  }
45  }
46 
47  return strides;
48 }
49 
50 template <int Layout, typename IndexType, size_t NumDims>
51 EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(const Eigen::array<IndexType, NumDims>& dimensions) {
52   return strides<Layout>(DSizes<IndexType, NumDims>(dimensions));
53 }
54 
55 template <int Layout, std::ptrdiff_t... Indices>
56 EIGEN_STRONG_INLINE DSizes<std::ptrdiff_t, sizeof...(Indices)> strides(const Sizes<Indices...>& sizes) {
57  return strides<Layout>(DSizes<std::ptrdiff_t, sizeof...(Indices)>(sizes));
58 }
59 
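// A minimal usage sketch of the strides() helpers above (hypothetical example
// function, not part of this header; it assumes <unsupported/Eigen/CXX11/Tensor>
// has been included):
inline void strides_usage_sketch() {
  Eigen::DSizes<Eigen::Index, 3> dims(2, 3, 4);
  // Column-major: the first dimension is innermost and has stride 1.
  auto col_major = Eigen::internal::strides<Eigen::ColMajor>(dims);  // {1, 2, 6}
  // Row-major: the last dimension is innermost and has stride 1.
  auto row_major = Eigen::internal::strides<Eigen::RowMajor>(dims);  // {12, 4, 1}
  EIGEN_UNUSED_VARIABLE(col_major);
  EIGEN_UNUSED_VARIABLE(row_major);
}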
60 // -------------------------------------------------------------------------- //
61 
62 // Tensor block shape type defines the shape preference for the blocks
63 // extracted from the larger tensor.
64 //
65 // Example: blocks of 100 elements from the large 100x100 tensor:
66 // - tensor: 100x100
67 // - target_block_size: 100
68 //
69 // TensorBlockShapeType:
70 // - kUniformAllDims: 100 blocks of size 10x10
71 // - kSkewedInnerDims: 100 blocks of size 100x1 (or 1x100 depending on
72 //   column or row major layout)
73 enum class TensorBlockShapeType { kUniformAllDims, kSkewedInnerDims };
74 
75 struct TensorBlockResourceRequirements {
76  TensorBlockShapeType shape_type; // target block shape
77  size_t size; // target block size
78  TensorOpCost cost_per_coeff; // cost of computing a single block element
79 
80 #ifdef EIGEN_HIPCC
81   // For HIPCC, we need to explicitly declare as a "device" function the
82   // constructor which is implicitly invoked in the "merge" / "any" routines,
83   // else HIPCC errors out complaining about the lack of a matching constructor.
84   EIGEN_DEVICE_FUNC TensorBlockResourceRequirements(TensorBlockShapeType shape_type_, size_t size_, TensorOpCost cost_)
85       : shape_type(shape_type_), size(size_), cost_per_coeff(cost_) {}
86 #endif
87 
88  template <typename Scalar>
89   EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize(TensorBlockShapeType shape_type,
90                                                                             size_t size_in_bytes, TensorOpCost cost) {
91  const size_t size = numext::maxi(size_t(1), size_in_bytes / sizeof(Scalar));
92  return {shape_type, size, cost};
93  }
94 
95  template <typename Scalar>
96   EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize(TensorBlockShapeType shape_type,
97                                                                             size_t size_in_bytes) {
98  // This default cost per coefficient is valid for most materialized tensor
99  // block evaluation implementations, because they typically just read
100  // coefficients from the underlying tensor storage, and write to the tensor
101  // block buffer (scratch or destination memory, reads and writes have linear
102  // access pattern). We ignore the fixed cost of block evaluation, because in
103  // practice it should be negligible.
104  //
105  // Lazy block evaluation adds the cost of calling a functor for each
106  // coefficient.
107  //
108  // All non-trivial block evaluation implementations must provide their own
109  // cost approximation (e.g. shuffling inner dimension has a much higher cost
110  // because it reads memory randomly, although the total number of moved
111  // bytes is the same).
112  return withShapeAndSize<Scalar>(shape_type, size_in_bytes,
113  {/*bytes_loaded=*/sizeof(Scalar),
114  /*bytes_stored=*/sizeof(Scalar),
115  /*compute_cycles=*/0});
116  }
117 
118  template <typename Scalar>
119   EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements skewed(size_t size_in_bytes) {
120     return withShapeAndSize<Scalar>(TensorBlockShapeType::kSkewedInnerDims, size_in_bytes);
121  }
122 
123  template <typename Scalar>
124   EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements uniform(size_t size_in_bytes) {
125     return withShapeAndSize<Scalar>(TensorBlockShapeType::kUniformAllDims, size_in_bytes);
126  }
127 
128   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE TensorBlockResourceRequirements merge(
129       const TensorBlockResourceRequirements& lhs, const TensorBlockResourceRequirements& rhs) {
130     return {merge(lhs.shape_type, rhs.shape_type),           // shape_type
131  merge(lhs.size, rhs.size), // size
132  merge(lhs.cost_per_coeff, rhs.cost_per_coeff)}; // cost_per_coeff
133  }
134 
135   EIGEN_DEVICE_FUNC TensorBlockResourceRequirements& addCostPerCoeff(TensorOpCost cost) {
136     cost_per_coeff += cost;
137  return *this;
138  }
139 
140  // This is a resource requirement that should be returned from expressions
141  // that do not have any block evaluation preference (e.g. default tensor
142  // expression with raw buffer access).
143   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE TensorBlockResourceRequirements any() {
144     return {TensorBlockShapeType::kUniformAllDims, 1, {0, 0, 0}};
145  }
146 
147  private:
149 
150  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE size_t merge(size_t lhs_size, size_t rhs_size) {
151  return numext::maxi(lhs_size, rhs_size);
152  }
153 
154   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE TensorBlockShapeType merge(TensorBlockShapeType lhs,
155                                                                           TensorBlockShapeType rhs) {
156     return (lhs == TensorBlockShapeType::kSkewedInnerDims || rhs == TensorBlockShapeType::kSkewedInnerDims)
157                ? TensorBlockShapeType::kSkewedInnerDims
158                : TensorBlockShapeType::kUniformAllDims;
159   }
160 
161   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE TensorOpCost merge(TensorOpCost lhs_cost, TensorOpCost rhs_cost) {
162     return lhs_cost + rhs_cost;
163  }
164 };
165 
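// A minimal usage sketch of how an evaluator could build and combine the
// requirements above (hypothetical example function, not part of this header;
// the float scalar and the 48KB budget are arbitrary choices for illustration):
inline TensorBlockResourceRequirements requirements_usage_sketch() {
  // One expression prefers blocks skewed towards the inner dimension.
  TensorBlockResourceRequirements lhs = TensorBlockResourceRequirements::skewed<float>(48 * 1024);
  // Another expression prefers "squarish" blocks with the same byte budget.
  TensorBlockResourceRequirements rhs = TensorBlockResourceRequirements::uniform<float>(48 * 1024);
  // The merged requirement keeps the larger size, the skewed shape (it wins
  // over uniform), and the sum of the per-coefficient costs.
  return TensorBlockResourceRequirements::merge(lhs, rhs);
}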
166 // -------------------------------------------------------------------------- //
167 // TensorBlockDescriptor specifies a block offset within a tensor and the block
168 // sizes along each of the tensor dimensions.
169 
170 template <int NumDims, typename IndexType = Eigen::Index>
171 class TensorBlockDescriptor {
172  public:
173   typedef DSizes<IndexType, NumDims> Dimensions;
174 
175   // If we evaluate a Tensor assignment, and the expression on the left already
176   // has a memory buffer, then we might do a performance optimization, and
177   // evaluate the root expression directly into the final output memory. Sometimes
178   // it's possible to reuse it for materializing subexpressions inside an
179   // expression tree, to avoid dynamic memory allocation.
180  //
181   // The pointer type of the underlying storage is erased, because passing the
182   // Scalar type through all the expression evaluation layers requires way too
183   // many templates. In practice the destination buffer type should always match
184   // the evaluated expression scalar type.
186    public:
187     enum DestinationBufferKind : int {
188  // The above explicit specification of "int" as the enum basetype is
189  // needed to get around a HIPCC link error ("the field type is not
190  // amp-compatible")
191  // which is issued for class members with the enum type.
192  // TODO(rocm):
193  // remove the "int" basetype once HIPCC has been fixed to not error out
194  // in the above scenario.
195 
196  // Destination buffer is not defined (`m_data` == nullptr).
197     kEmpty,
198 
199  // Tensor block defined by an owning tensor block descriptor can fit
200  // contiguously into the destination buffer. In this case it's safe to
201  // materialize tensor block in the destination buffer, wrap it in a
202     // TensorMap, and use it to build an Eigen expression on top of it.
203     kContiguous,
204 
205  // Destination buffer strides do not match strides of the contiguously
206  // stored block, and it's impossible to define a TensorMap over this
207  // buffer. However if we are evaluating a root of an expression tree, we
208  // still can materialize an output into this destination, because we can
209  // guarantee that no one will ever access it through block API.
210  //
211  // In theory it is possible to build valid TensorStriding<TensorMap>
212  // expression on top of this destination buffer, however it has
213  // inefficient coeff/packet access, and defeats the purpose of fast block
214  // evaluation API.
215  kStrided
216  };
217 
218  template <typename Scalar>
219  Scalar* data() const {
220       eigen_assert(m_data_type_size == sizeof(Scalar));
221       return static_cast<Scalar*>(m_data);
222  }
223 
224  const Dimensions& strides() const { return m_strides; }
225  const DestinationBufferKind& kind() const { return m_kind; }
226 
227  private:
228  friend class TensorBlockDescriptor<NumDims, IndexType>;
229 
230     DestinationBuffer() : m_data(NULL), m_data_type_size(0), m_kind(kEmpty) {}
231 
232  template <typename Scalar>
233     DestinationBuffer(Scalar* data, const Dimensions& strides, DestinationBufferKind kind)
234         : m_data(static_cast<void*>(data)), m_data_type_size(sizeof(Scalar)), m_strides(strides), m_kind(kind) {}
235 
236  template <int Layout, typename Scalar>
237     static DestinationBuffer make(const TensorBlockDescriptor& desc, Scalar* data, const Dimensions& strides) {
238       return DestinationBuffer(data, strides, kind<Layout>(desc, strides));
239  }
240 
241  template <int Layout>
242     static DestinationBufferKind kind(const TensorBlockDescriptor& desc, const Dimensions& strides) {
243       const Dimensions& desc_dims = desc.dimensions();
244  const Dimensions& desc_strides = internal::strides<Layout>(desc_dims);
245  for (int i = 0; i < NumDims; ++i) {
246  if (desc_dims[i] == 1) continue;
247  if (desc_strides[i] != strides[i]) return kStrided;
248  }
249  return kContiguous;
250  }
251 
252  // Storage pointer is type erased, to reduce template bloat, but we still
253  // keep the size of the underlying element type for error checking.
254  void* m_data;
255     size_t m_data_type_size;
256 
257  // Destination buffer dimensions always match the dimensions of a tensor
258  // block descriptor it belongs to, however strides might be different.
259     Dimensions m_strides;
260 
261     DestinationBufferKind m_kind;
262   };
263 
264   TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions, const DestinationBuffer& destination)
265       : m_offset(offset), m_dimensions(dimensions), m_destination(destination) {}
266 
267   TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions)
268       : m_offset(offset), m_dimensions(dimensions), m_destination(DestinationBuffer()) {}
269 
270  IndexType offset() const { return m_offset; }
271  const Dimensions& dimensions() const { return m_dimensions; }
272  IndexType dimension(int index) const { return m_dimensions[index]; }
273  IndexType size() const { return array_prod<IndexType>(m_dimensions); }
274 
275  const DestinationBuffer& destination() const { return m_destination; }
276 
277  template <int Layout, typename Scalar>
278  void AddDestinationBuffer(Scalar* dst_base, const Dimensions& dst_strides) {
279  eigen_assert(dst_base != NULL);
280  m_destination = DestinationBuffer::template make<Layout>(*this, dst_base, dst_strides);
281  }
282 
283  template <int Layout, typename Scalar, typename DstStridesIndexType>
284   void AddDestinationBuffer(Scalar* dst_base, const DSizes<DstStridesIndexType, NumDims>& dst_strides) {
285     // DSizes constructor will do index type promotion if it's safe.
286  AddDestinationBuffer<Layout>(dst_base, Dimensions(dst_strides));
287  }
288 
289   TensorBlockDescriptor& DropDestinationBuffer() {
290     m_destination.m_data = NULL;
291     m_destination.m_kind = DestinationBuffer::kEmpty;
292  return *this;
293  }
294 
295   bool HasDestinationBuffer() const { return m_destination.kind() != DestinationBuffer::kEmpty; }
296 
297   // Returns a copy of `*this` with updated offset.
298   TensorBlockDescriptor WithOffset(IndexType offset) const {
299     return TensorBlockDescriptor(offset, m_dimensions, m_destination);
300   }
301 
302  private:
303  // Offset and dimensions are immutable after construction. Block descriptor
304  // can only be mutated by adding or dropping destination.
305  const IndexType m_offset;
306   const Dimensions m_dimensions;
307   DestinationBuffer m_destination;
308 };
309 
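// A minimal usage sketch of TensorBlockDescriptor (hypothetical example
// function, not part of this header; the 2D shape and float scalar are
// arbitrary choices for illustration):
inline void block_descriptor_usage_sketch(float* dst_base) {
  typedef TensorBlockDescriptor<2, Eigen::Index> Desc;
  // Describe a 10x20 block that starts at linear offset 100 of the tensor.
  Desc desc(/*offset=*/100, Desc::Dimensions(10, 20));
  // Attach a destination buffer whose strides match a contiguous col-major
  // 10x20 block, so the block may be materialized directly into `dst_base`.
  desc.AddDestinationBuffer<ColMajor>(dst_base, Desc::Dimensions(1, 10));
  eigen_assert(desc.HasDestinationBuffer());
}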
310 // -------------------------------------------------------------------------- //
311 // TensorBlockMapper is responsible for iterating over the blocks of a tensor.
312 
313 template <int NumDims, int Layout, typename IndexType = Eigen::Index>
314 class TensorBlockMapper {
315   typedef TensorBlockDescriptor<NumDims, IndexType> BlockDescriptor;
316 
317  public:
318   typedef DSizes<IndexType, NumDims> Dimensions;
319 
320  TensorBlockMapper() = default;
321   TensorBlockMapper(const DSizes<IndexType, NumDims>& dimensions, const TensorBlockResourceRequirements& requirements)
322       : m_tensor_dimensions(dimensions), m_requirements(requirements) {
323     // Compute block dimensions and the total number of blocks.
324     InitializeBlockDimensions();
325   }
326 
327   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockCount() const { return m_total_block_count; }
328 
329   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockTotalSize() const { return m_block_dimensions.TotalSize(); }
330 
331   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DSizes<IndexType, NumDims>& blockDimensions() const {
332     return m_block_dimensions;
333  }
334 
335   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockDescriptor blockDescriptor(IndexType block_index) const {
336     static const bool isColMajor = Layout == static_cast<int>(ColMajor);
337 
338  IndexType offset = 0;
339  DSizes<IndexType, NumDims> dimensions;
340 
341  if (NumDims == 0) return BlockDescriptor(offset, dimensions);
342 
343  // Iterate outer -> inner dimensions.
344  for (int i = NumDims - 1; i >= 0; --i) {
345  const int dim = isColMajor ? i : NumDims - i - 1;
346 
347  const IndexType idx = block_index / m_block_strides[dim];
348  block_index -= idx * m_block_strides[dim];
349 
350  const IndexType coord = idx * m_block_dimensions[dim];
351  dimensions[dim] = numext::mini(m_tensor_dimensions[dim] - coord, m_block_dimensions[dim]);
352  offset += coord * m_tensor_strides[dim];
353  }
354 
355  return {offset, dimensions};
356  }
357 
358  private:
359   void InitializeBlockDimensions() {
360     // Requested block shape and size.
361     const TensorBlockShapeType shape_type = m_requirements.shape_type;
362     IndexType target_block_size = numext::maxi<IndexType>(1, static_cast<IndexType>(m_requirements.size));
363 
364  IndexType tensor_size = m_tensor_dimensions.TotalSize();
365 
366  // Corner case: one of the dimensions is zero. Logic below is too complex
367  // to handle this case on a general basis, just use unit block size.
368  // Note: we must not yield blocks with zero dimensions (recipe for
369  // overflows/underflows, divisions by zero and NaNs later).
370  if (tensor_size == 0) {
371  for (int i = 0; i < NumDims; ++i) {
372  m_block_dimensions[i] = 1;
373       }
374       m_total_block_count = 0;
375       return;
376  }
377 
378  // If tensor fits into a target block size, evaluate it as a single block.
379  if (tensor_size <= target_block_size) {
380       m_block_dimensions = m_tensor_dimensions;
381       m_total_block_count = 1;
382       // The only valid block index is `0`, and in this case we do not need
383  // to compute real strides for tensor or blocks (see blockDescriptor).
384  for (int i = 0; i < NumDims; ++i) {
385  m_tensor_strides[i] = 0;
386  m_block_strides[i] = 1;
387  }
388  return;
389  }
390 
391  static const bool isColMajor = Layout == static_cast<int>(ColMajor);
392 
393  // Block shape skewed towards inner dimension.
394  if (shape_type == TensorBlockShapeType::kSkewedInnerDims) {
395  IndexType coeff_to_allocate = target_block_size;
396 
397  for (int i = 0; i < NumDims; ++i) {
398  const int dim = isColMajor ? i : NumDims - i - 1;
399  m_block_dimensions[dim] = numext::mini(coeff_to_allocate, m_tensor_dimensions[dim]);
400  coeff_to_allocate =
401  numext::div_ceil(coeff_to_allocate, numext::maxi(static_cast<IndexType>(1), m_block_dimensions[dim]));
402  }
403  eigen_assert(coeff_to_allocate == 1);
404 
405  } else if (shape_type == TensorBlockShapeType::kUniformAllDims) {
406  // Tensor will not fit within 'target_block_size' budget: calculate tensor
407  // block dimension sizes based on "square" dimension size target.
408  const IndexType dim_size_target = convert_index<IndexType>(
409  std::pow(static_cast<float>(target_block_size), 1.0f / static_cast<float>(m_block_dimensions.rank())));
410 
411  for (int i = 0; i < NumDims; ++i) {
412  // TODO(andydavis) Adjust the inner most 'block_dim_size' to make it
413  // a multiple of the packet size. Note that reducing
414  // 'block_dim_size' in this manner can increase the number of
415  // blocks, and so will amplify any per-block overhead.
416  m_block_dimensions[i] = numext::mini(dim_size_target, m_tensor_dimensions[i]);
417  }
418 
419  // Add any un-allocated coefficients to inner dimension(s).
420  IndexType total_size = m_block_dimensions.TotalSize();
421  for (int i = 0; i < NumDims; ++i) {
422  const int dim = isColMajor ? i : NumDims - i - 1;
423 
424  if (m_block_dimensions[dim] < m_tensor_dimensions[dim]) {
425  const IndexType total_size_other_dims = total_size / m_block_dimensions[dim];
426  const IndexType alloc_avail = numext::div_ceil<IndexType>(target_block_size, total_size_other_dims);
427  if (alloc_avail == m_block_dimensions[dim]) {
428  // Insufficient excess coefficients to allocate.
429  break;
430  }
431  m_block_dimensions[dim] = numext::mini(m_tensor_dimensions[dim], alloc_avail);
432  total_size = total_size_other_dims * m_block_dimensions[dim];
433  }
434  }
435 
436  } else {
437  eigen_assert(false); // unknown block shape
438  }
439 
440     eigen_assert(m_block_dimensions.TotalSize() >=
441                  numext::mini<IndexType>(target_block_size, m_tensor_dimensions.TotalSize()));
442 
443  // Calculate block counts by dimension and total block count.
444  DSizes<IndexType, NumDims> block_count;
445  for (int i = 0; i < NumDims; ++i) {
446       block_count[i] = numext::div_ceil(m_tensor_dimensions[i], m_block_dimensions[i]);
447     }
448  m_total_block_count = array_prod(block_count);
449 
450  // Calculate block strides (used for enumerating blocks).
451  m_tensor_strides = strides<Layout>(m_tensor_dimensions);
452  m_block_strides = strides<Layout>(block_count);
453  }
454 
455   DSizes<IndexType, NumDims> m_tensor_dimensions;
456   TensorBlockResourceRequirements m_requirements;
457 
458   DSizes<IndexType, NumDims> m_block_dimensions;
459   IndexType m_total_block_count;
460 
461   DSizes<IndexType, NumDims> m_tensor_strides;
462   DSizes<IndexType, NumDims> m_block_strides;
463 };
464 
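// A minimal usage sketch of TensorBlockMapper (hypothetical example function,
// not part of this header; the 256x256 shape and the ~10000 coefficient budget
// are arbitrary choices for illustration):
inline void block_mapper_usage_sketch() {
  typedef TensorBlockMapper<2, ColMajor, Eigen::Index> Mapper;
  const DSizes<Eigen::Index, 2> tensor_dims(256, 256);
  const Mapper mapper(tensor_dims, TensorBlockResourceRequirements::uniform<float>(10000 * sizeof(float)));
  for (Eigen::Index i = 0; i < mapper.blockCount(); ++i) {
    // Each descriptor carries the offset and the per-dimension sizes of the
    // i-th block (edge blocks may be smaller than blockDimensions()).
    TensorBlockDescriptor<2, Eigen::Index> desc = mapper.blockDescriptor(i);
    EIGEN_UNUSED_VARIABLE(desc);
  }
}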
465 // -------------------------------------------------------------------------- //
466 // TensorBlockScratchAllocator is responsible for allocating temporary buffers
467 // for block evaluation (output or input block materialization). Given that
468 // Eigen expression traversal order is deterministic, all temporary allocations
469 // happen in the same order and usually have exactly the same size. The
470 // scratch allocator keeps track of all dynamic allocations, and after the
471 // first block evaluation is completed, we should be able to reuse all the
472 // temporary buffers for the next block evaluation.
473 
474 template <typename Device>
475 class TensorBlockScratchAllocator {
476  public:
477  explicit TensorBlockScratchAllocator(const Device& device) : m_device(device), m_allocation_index(0) {}
478 
479   ~TensorBlockScratchAllocator() {
480     for (size_t i = 0; i < m_allocations.size(); ++i) {
481  m_device.deallocate(m_allocations[i].ptr);
482  }
483  }
484 
485  void* allocate(size_t size) {
486  // TODO(ezhulenev): Remove when replaced with inlined vector.
487  if (m_allocations.capacity() == 0) m_allocations.reserve(8);
488 
489     // Check if we already have an existing allocation at the current index.
490  const int num_allocations = static_cast<int>(m_allocations.size());
491  const bool has_allocation = m_allocation_index < num_allocations;
492 
493  // Allocation index can't be larger than the number of allocations.
494  eigen_assert(m_allocation_index <= num_allocations);
495 
496     // If we have an existing allocation, and its size is larger than or equal
497     // to the requested size, we do nothing.
498 
499  // If current allocation can't fit requested size, we deallocate it, and
500  // replace with a larger allocation.
501  if (has_allocation && m_allocations[m_allocation_index].size < size) {
502  m_device.deallocate(m_allocations[m_allocation_index].ptr);
503       m_allocations[m_allocation_index].ptr = m_device.allocate(size);
504       m_allocations[m_allocation_index].size = size;
505     }
506 
507     // Make a new allocation if we don't have an existing one.
508  if (!has_allocation) {
509  Allocation allocation;
510  allocation.ptr = m_device.allocate(size);
511  allocation.size = size;
512  m_allocations.push_back(allocation);
513  }
514 
517 
518  return m_allocations[m_allocation_index++].ptr;
519  }
520 
521  void reset() { m_allocation_index = 0; }
522 
523  private:
524  struct Allocation {
525  void* ptr;
526  size_t size;
527  };
528 
529  const Device& m_device;
530   int m_allocation_index;
531   // TODO(ezhulenev): This should be an inlined vector.
532  std::vector<Allocation> m_allocations;
533 };
534 
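// A minimal usage sketch of the allocate/reset reuse pattern described above
// (hypothetical example function, not part of this header; DefaultDevice and
// the buffer sizes are arbitrary choices for illustration):
inline void scratch_allocator_usage_sketch() {
  DefaultDevice device;
  TensorBlockScratchAllocator<DefaultDevice> scratch(device);
  // First block evaluation: buffers are allocated on demand.
  void* a = scratch.allocate(1024);
  void* b = scratch.allocate(4096);
  EIGEN_UNUSED_VARIABLE(a);
  EIGEN_UNUSED_VARIABLE(b);
  // Next block evaluation: reset() rewinds the allocation index, so the same
  // buffers are handed out again (re-allocated only if a larger size is
  // requested). All buffers are freed when `scratch` goes out of scope.
  scratch.reset();
  void* a_again = scratch.allocate(1024);
  EIGEN_UNUSED_VARIABLE(a_again);
}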
535 // -------------------------------------------------------------------------- //
536 // TensorBlockKind represents all possible block kinds that can be produced by
537 // the TensorEvaluator::evalBlock function.
538 enum TensorBlockKind {
539   // Tensor block that is a lazy expression that must be assigned to a
540   // destination using TensorBlockAssign.
541   kExpr,
542 
543  // Tensor block that is a view into a memory buffer owned by an underlying
544  // Tensor expression (e.g. it can be a view into a Tensor buffer).
545   kView,
546 
547  // Tensor block that was materialized in a scratch memory buffer, allocated
548  // with TensorBlockScratchAllocator. This block must be copied to a
549  // destination, similar to a block of `kExpr` type.
550   kMaterializedInScratch,
551 
552  // Tensor block that was materialized directly into the final output memory
553  // buffer. For example if the left side of an assignment is a Tensor, we can
554  // directly materialize the block in the destination memory.
555  //
556  // If strides in the output buffer do not match tensor block strides, the
557  // Tensor expression will be invalid, and should not be used by
558  // TensorBlockAssign or for constructing another block expression.
559   kMaterializedInOutput
560 };
561 
562 // -------------------------------------------------------------------------- //
563 // TensorBlockNotImplemented should be used to define the TensorBlock typedef
564 // in TensorEvaluators that do not support block evaluation.
565 
566 class TensorBlockNotImplemented {
567  public:
568  typedef void XprType;
569 };
570 
571 // -------------------------------------------------------------------------- //
572 // XprScalar extracts the Scalar type from Eigen expressions (if the expression
573 // type is not void). It's required to be able to define a lazy block expression
574 // for argument types that do not support block evaluation.
575 
576 template <typename XprType>
577 struct XprScalar {
578  typedef typename XprType::Scalar type;
579 };
580 template <>
581 struct XprScalar<void> {
582  typedef void type;
583 };
584 
585 // -------------------------------------------------------------------------- //
586 // TensorMaterializedBlock is a fully evaluated block of the original tensor,
587 // and XprType is just a TensorMap over the data. This block type is typically
588 // used to materialize blocks of tensor expressions that can't be efficiently
589 // represented as lazy Tensor expressions with fast coeff/packet operations,
590 // e.g. we materialize all broadcasts into evaluated blocks.
591 //
592 // TensorMaterializedBlock does not own its memory buffer, it's either a memory
593 // buffer that backs the original expression (e.g. block is just a view into a
594 // Tensor), or a memory buffer allocated with scratch allocator, and in this
595 // case the scratch allocator will deallocate it at the end of block based
596 // expression execution.
597 //
598 // If the block was evaluated directly into the output buffer, and strides in
599 // the output buffer do not match block strides, the TensorMap expression will
600 // be invalid, and should never be used in block assignment or any other tensor
601 // expression.
602 
603 template <typename Scalar, int NumDims, int Layout, typename IndexType = Eigen::Index>
604 class TensorMaterializedBlock {
605  public:
606   typedef DSizes<IndexType, NumDims> Dimensions;
607   typedef TensorMap<const Tensor<Scalar, NumDims, Layout> > XprType;
608 
609   TensorMaterializedBlock(TensorBlockKind kind, const Scalar* data, const Dimensions& dimensions,
610                           bool valid_expr = true)
611  : m_kind(kind), m_data(data), m_dimensions(dimensions), m_expr(m_data, m_dimensions), m_valid_expr(valid_expr) {
612     eigen_assert(m_kind == internal::TensorBlockKind::kView ||
613                  m_kind == internal::TensorBlockKind::kMaterializedInScratch ||
614                  m_kind == internal::TensorBlockKind::kMaterializedInOutput);
615   }
616 
617  TensorBlockKind kind() const { return m_kind; }
618  // NOTE(ezhulenev): Returning XprType by value like in other block types
619  // causes asan failures. The theory is that XprType::Nested doesn't work
620  // properly for TensorMap.
621  const XprType& expr() const {
622     eigen_assert(m_valid_expr);
623     return m_expr;
624  }
625  const Scalar* data() const { return m_data; }
626  void cleanup() {}
627 
628   typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;
629 
630  // TensorMaterializedBlock can be backed by different types of storage:
631  //
632  // (1) Contiguous block of memory allocated with scratch allocator.
633  // (2) Contiguous block of memory reused from tensor block descriptor
634  // destination buffer.
635  // (3) Strided block of memory reused from tensor block descriptor
636  // destination buffer.
637  //
638  class Storage {
639  public:
640  Scalar* data() const { return m_data; }
641  const Dimensions& dimensions() const { return m_dimensions; }
642  const Dimensions& strides() const { return m_strides; }
643 
643     TensorMaterializedBlock AsTensorMaterializedBlock() const {
644       return TensorMaterializedBlock(
645           m_materialized_in_output ? internal::TensorBlockKind::kMaterializedInOutput
646                                    : internal::TensorBlockKind::kMaterializedInScratch,
647           m_data, m_dimensions, !m_strided_storage);
648     }
649 
650  private:
651  friend class TensorMaterializedBlock<Scalar, NumDims, Layout, IndexType>;
652 
653  Storage(Scalar* data, const Dimensions& dimensions, const Dimensions& strides, bool materialized_in_output,
654  bool strided_storage)
655         : m_data(data),
656           m_dimensions(dimensions),
657           m_strides(strides),
658           m_materialized_in_output(materialized_in_output),
659  m_strided_storage(strided_storage) {}
660 
661     Scalar* m_data;
662     Dimensions m_dimensions;
663     Dimensions m_strides;
664     bool m_materialized_in_output;
665     bool m_strided_storage;
666   };
667 
668  // Creates a storage for materialized block either from the block descriptor
669  // destination buffer, or allocates a new buffer with scratch allocator.
670  template <typename TensorBlockScratch>
671  EIGEN_STRONG_INLINE static Storage prepareStorage(TensorBlockDesc& desc, TensorBlockScratch& scratch,
672  bool allow_strided_storage = false) {
673  // Try to reuse destination as an output block buffer.
674  typedef typename TensorBlockDesc::DestinationBuffer DestinationBuffer;
675 
676  if (desc.destination().kind() == DestinationBuffer::kContiguous) {
677  Scalar* buffer = desc.destination().template data<Scalar>();
678  desc.DropDestinationBuffer();
679  return Storage(buffer, desc.dimensions(), internal::strides<Layout>(desc.dimensions()),
680  /*materialized_in_output=*/true,
681  /*strided_storage=*/false);
682 
683  } else if (desc.destination().kind() == DestinationBuffer::kStrided && allow_strided_storage) {
684  Scalar* buffer = desc.destination().template data<Scalar>();
685  desc.DropDestinationBuffer();
686  return Storage(buffer, desc.dimensions(), desc.destination().strides(),
687  /*materialized_in_output=*/true, /*strided_storage=*/true);
688 
689  } else {
690  void* mem = scratch.allocate(desc.size() * sizeof(Scalar));
691  return Storage(static_cast<Scalar*>(mem), desc.dimensions(), internal::strides<Layout>(desc.dimensions()),
692  /*materialized_in_output=*/false,
693  /*strided_storage=*/false);
694  }
695  }
696 
697  // Creates a materialized block for the given descriptor from a memory buffer.
698  template <typename DataDimensions, typename TensorBlockScratch>
699  EIGEN_STRONG_INLINE static TensorMaterializedBlock materialize(const Scalar* data, const DataDimensions& data_dims,
700  TensorBlockDesc& desc, TensorBlockScratch& scratch) {
702 
703  // If a tensor block dimensions covers a contiguous block of the underlying
704  // memory, we can skip block buffer memory allocation, and construct a block
705  // from existing `data` memory buffer.
706  //
707  // Example: (RowMajor layout)
708  // data_dims: [11, 12, 13, 14]
709  // desc.dimensions(): [1, 1, 3, 14]
710  //
711  // In this case we can construct a TensorBlock starting at
712  // `data + desc.offset()`, with a `desc.dimensions()` block sizes.
713  static const bool is_col_major = Layout == ColMajor;
714 
715  // Find out how many inner dimensions have a matching size.
716  int num_matching_inner_dims = 0;
717  for (int i = 0; i < NumDims; ++i) {
718  int dim = is_col_major ? i : NumDims - i - 1;
719  if (data_dims[dim] != desc.dimensions()[dim]) break;
720  ++num_matching_inner_dims;
721  }
722 
723  // All the outer dimensions must be of size `1`, except a single dimension
724  // before the matching inner dimension (`3` in the example above).
725  bool can_use_direct_access = true;
726  for (int i = num_matching_inner_dims + 1; i < NumDims; ++i) {
727  int dim = is_col_major ? i : NumDims - i - 1;
728  if (desc.dimension(dim) != 1) {
729  can_use_direct_access = false;
730  break;
731  }
732  }
733 
734  if (can_use_direct_access) {
735  const Scalar* block_start = data + desc.offset();
736       return TensorMaterializedBlock(internal::TensorBlockKind::kView, block_start, desc.dimensions());
737 
738  } else {
739  // Reuse destination buffer or allocate new buffer with scratch allocator.
740  const Storage storage = prepareStorage(desc, scratch);
741 
742       typedef internal::TensorBlockIO<Scalar, IndexType, NumDims, Layout> TensorBlockIO;
743       typedef typename TensorBlockIO::Dst TensorBlockIODst;
744  typedef typename TensorBlockIO::Src TensorBlockIOSrc;
745 
746  TensorBlockIOSrc src(internal::strides<Layout>(Dimensions(data_dims)), data, desc.offset());
747  TensorBlockIODst dst(storage.dimensions(), storage.strides(), storage.data());
748 
749  TensorBlockIO::Copy(dst, src);
750  return storage.AsTensorMaterializedBlock();
751  }
752  }
753 
754  private:
755   TensorBlockKind m_kind;
756   const Scalar* m_data;
757   Dimensions m_dimensions;
758   XprType m_expr;
759   bool m_valid_expr;
760 };
761 
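// A minimal usage sketch of TensorMaterializedBlock::materialize (hypothetical
// example function, not part of this header; the 16x8 tensor and the 4x8 block
// are arbitrary choices for illustration):
inline void materialized_block_usage_sketch(const float* data) {
  typedef TensorMaterializedBlock<float, 2, ColMajor, Eigen::Index> Block;
  typedef TensorBlockDescriptor<2, Eigen::Index> Desc;

  DefaultDevice device;
  TensorBlockScratchAllocator<DefaultDevice> scratch(device);

  // Describe a 4x8 block at offset 0 of a 16x8 col-major tensor stored at `data`.
  Desc desc(/*offset=*/0, Desc::Dimensions(4, 8));
  // The block is not contiguous in `data`, so it is copied into scratch memory.
  Block block = Block::materialize(data, DSizes<Eigen::Index, 2>(16, 8), desc, scratch);
  // The result wraps the materialized buffer in a TensorMap expression.
  const Block::XprType& expr = block.expr();
  EIGEN_UNUSED_VARIABLE(expr);
  block.cleanup();
}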
762 // -------------------------------------------------------------------------- //
763 // TensorCwiseUnaryBlock is a lazy tensor expression block that applies UnaryOp
764 // functor to the blocks produced by the underlying Tensor expression.
765 
766 template <typename UnaryOp, typename ArgTensorBlock>
767 class TensorCwiseUnaryBlock {
768   static constexpr bool NoArgBlockAccess = internal::is_void<typename ArgTensorBlock::XprType>::value;
769 
770  public:
771  typedef std::conditional_t<NoArgBlockAccess, void,
772                              TensorCwiseUnaryOp<UnaryOp, const typename ArgTensorBlock::XprType> >
773       XprType;
774 
775   typedef typename XprScalar<XprType>::type Scalar;
776 
777  TensorCwiseUnaryBlock(const ArgTensorBlock& arg_block, const UnaryOp& functor)
778  : m_arg_block(arg_block), m_functor(functor) {}
779 
780   TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
781 
782  XprType expr() const { return XprType(m_arg_block.expr(), m_functor); }
783  const Scalar* data() const { return NULL; }
784  void cleanup() { m_arg_block.cleanup(); }
785 
786  private:
787  ArgTensorBlock m_arg_block;
788  UnaryOp m_functor;
789 };
790 
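// A minimal usage sketch of wrapping an argument block into a lazy unary block
// (hypothetical example function, not part of this header; scalar_abs_op and
// the 4x8 view are arbitrary choices for illustration):
inline void cwise_unary_block_usage_sketch(const float* data) {
  typedef TensorMaterializedBlock<float, 2, ColMajor, Eigen::Index> ArgBlock;
  typedef TensorCwiseUnaryBlock<scalar_abs_op<float>, ArgBlock> AbsBlock;

  // A block that is simply a view into an existing 4x8 buffer.
  ArgBlock arg(TensorBlockKind::kView, data, DSizes<Eigen::Index, 2>(4, 8));
  AbsBlock abs_block(arg, scalar_abs_op<float>());

  // expr() is a TensorCwiseUnaryOp that can be assigned to a destination
  // buffer with TensorBlockAssignment.
  const AbsBlock::XprType expr = abs_block.expr();
  EIGEN_UNUSED_VARIABLE(expr);
  abs_block.cleanup();
}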
791 // -------------------------------------------------------------------------- //
792 // TensorCwiseBinaryBlock is a lazy tensor expression block that applies a
793 // BinaryOp functor to the blocks produced by the underlying Tensor expressions.
794 
795 template <typename BinaryOp, typename LhsTensorBlock, typename RhsTensorBlock>
796 class TensorCwiseBinaryBlock {
797   static constexpr bool NoArgBlockAccess = internal::is_void<typename LhsTensorBlock::XprType>::value ||
798                                            internal::is_void<typename RhsTensorBlock::XprType>::value;
799 
800  public:
801  typedef std::conditional_t<
802  NoArgBlockAccess, void,
803       TensorCwiseBinaryOp<BinaryOp, const typename LhsTensorBlock::XprType, const typename RhsTensorBlock::XprType> >
804       XprType;
805 
806   typedef typename XprScalar<XprType>::type Scalar;
807 
808  TensorCwiseBinaryBlock(const LhsTensorBlock& left_block, const RhsTensorBlock& right_block, const BinaryOp& functor)
809  : m_left_block(left_block), m_right_block(right_block), m_functor(functor) {}
810 
811   TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
812 
813  XprType expr() const { return XprType(m_left_block.expr(), m_right_block.expr(), m_functor); }
814 
815  const Scalar* data() const { return NULL; }
816 
817  void cleanup() {
818  m_left_block.cleanup();
819  m_right_block.cleanup();
820  }
821 
822  private:
823  LhsTensorBlock m_left_block;
824  RhsTensorBlock m_right_block;
825  BinaryOp m_functor;
826 };
827 
828 // -------------------------------------------------------------------------- //
829 // TensorUnaryExprBlock is a lazy tensor expression block that can construct
830 // an arbitrary tensor expression from a block of the underlying type (this is a
831 // generalization of the TensorCwiseUnaryBlock for arbitrary expressions).
832 
833 template <typename BlockFactory, typename ArgTensorBlock>
834 class TensorUnaryExprBlock {
835   typedef typename ArgTensorBlock::XprType ArgXprType;
836   static constexpr bool NoArgBlockAccess = internal::is_void<ArgXprType>::value;
837 
838  public:
839   typedef std::conditional_t<NoArgBlockAccess, void, typename BlockFactory::template XprType<ArgXprType>::type> XprType;
840 
841   typedef typename XprScalar<XprType>::type Scalar;
842 
843  TensorUnaryExprBlock(const ArgTensorBlock& arg_block, const BlockFactory& factory)
844  : m_arg_block(arg_block), m_factory(factory) {}
845 
846   TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
847  XprType expr() const { return m_factory.expr(m_arg_block.expr()); }
848  const Scalar* data() const { return NULL; }
849  void cleanup() { m_arg_block.cleanup(); }
850 
851  private:
852  ArgTensorBlock m_arg_block;
853  BlockFactory m_factory;
854 };
855 
856 // -------------------------------------------------------------------------- //
857 // TensorTernaryExprBlock is a lazy tensor expression block that can construct
858 // an arbitrary tensor expression from three blocks of the underlying type.
859 
860 template <typename BlockFactory, typename Arg1TensorBlock, typename Arg2TensorBlock, typename Arg3TensorBlock>
861 class TensorTernaryExprBlock {
862   typedef typename Arg1TensorBlock::XprType Arg1XprType;
863   typedef typename Arg2TensorBlock::XprType Arg2XprType;
864   typedef typename Arg3TensorBlock::XprType Arg3XprType;
865 
866   static constexpr bool NoArgBlockAccess = internal::is_void<Arg1XprType>::value ||
867                                            internal::is_void<Arg2XprType>::value ||
868                                            internal::is_void<Arg3XprType>::value;
869 
870  public:
871   typedef std::conditional_t<NoArgBlockAccess, void,
872                              typename BlockFactory::template XprType<Arg1XprType, Arg2XprType, Arg3XprType>::type>
873       XprType;
874 
875   typedef typename XprScalar<XprType>::type Scalar;
876 
877  TensorTernaryExprBlock(const Arg1TensorBlock& arg1_block, const Arg2TensorBlock& arg2_block,
878  const Arg3TensorBlock& arg3_block, const BlockFactory& factory)
879  : m_arg1_block(arg1_block), m_arg2_block(arg2_block), m_arg3_block(arg3_block), m_factory(factory) {}
880 
881   TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
882  XprType expr() const { return m_factory.expr(m_arg1_block.expr(), m_arg2_block.expr(), m_arg3_block.expr()); }
883  const Scalar* data() const { return NULL; }
884  void cleanup() {
885  m_arg1_block.cleanup();
886  m_arg2_block.cleanup();
887  m_arg3_block.cleanup();
888  }
889 
890  private:
891  Arg1TensorBlock m_arg1_block;
892  Arg2TensorBlock m_arg2_block;
893  Arg3TensorBlock m_arg3_block;
894  BlockFactory m_factory;
895 };
896 
897 // -------------------------------------------------------------------------- //
898 // StridedLinearBufferCopy provides a method to copy data between two linear
899 // buffers with different strides, with optimized paths for scatter/gather.
900 
901 template <typename Scalar, typename IndexType>
902 class StridedLinearBufferCopy {
903   typedef typename packet_traits<Scalar>::type Packet;
904   typedef typename unpacket_traits<Packet>::half HalfPacket;
905   enum {
906     Vectorizable = packet_traits<Scalar>::Vectorizable,
907     PacketSize = packet_traits<Scalar>::size,
908     HalfPacketSize = unpacket_traits<HalfPacket>::size,
909     HasHalfPacket = static_cast<int>(HalfPacketSize) < static_cast<int>(PacketSize)
910  };
911 
912  public:
913  // Specifying linear copy kind statically gives ~30% speedup for small sizes.
914  enum class Kind {
915  Linear = 0, // src_stride == 1 && dst_stride == 1
916  Scatter = 1, // src_stride == 1 && dst_stride != 1
917  FillLinear = 2, // src_stride == 0 && dst_stride == 1
918  FillScatter = 3, // src_stride == 0 && dst_stride != 1
919  Gather = 4, // dst_stride == 1
920  Random = 5 // everything else
921  };
922 
923  struct Dst {
924  Dst(IndexType o, IndexType s, Scalar* d) : offset(o), stride(s), data(d) {}
925 
926  IndexType offset;
927  IndexType stride;
928     Scalar* data;
929   };
930 
931  struct Src {
932  Src(IndexType o, IndexType s, const Scalar* d) : offset(o), stride(s), data(d) {}
933 
934  IndexType offset;
935  IndexType stride;
936  const Scalar* data;
937  };
938 
939  template <typename StridedLinearBufferCopy::Kind kind>
940  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Dst& dst, const Src& src, const size_t count) {
941  Run<kind>(count, dst.offset, dst.stride, dst.data, src.offset, src.stride, src.data);
942  }
943 
944  private:
945  template <typename StridedLinearBufferCopy::Kind kind>
946  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const IndexType count, const IndexType dst_offset,
947  const IndexType dst_stride, Scalar* EIGEN_RESTRICT dst_data,
948  const IndexType src_offset, const IndexType src_stride,
949  const Scalar* EIGEN_RESTRICT src_data) {
950  const Scalar* src = &src_data[src_offset];
951  Scalar* dst = &dst_data[dst_offset];
952 
953  if (!Vectorizable) {
954  for (Index i = 0; i < count; ++i) {
955  dst[i * dst_stride] = src[i * src_stride];
956  }
957  return;
958  }
959 
960  const IndexType vectorized_size = PacketSize * (count / PacketSize);
961  IndexType i = 0;
962 
963     if (kind == StridedLinearBufferCopy::Kind::Linear) {
964  // ******************************************************************** //
965  // Linear copy from `src` to `dst`.
966  const IndexType unrolled_size = (4 * PacketSize) * (count / (4 * PacketSize));
967  eigen_assert(src_stride == 1 && dst_stride == 1);
968  for (; i < unrolled_size; i += 4 * PacketSize) {
969  for (int j = 0; j < 4; ++j) {
970  Packet p = ploadu<Packet>(src + i + j * PacketSize);
971  pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
972  }
973  }
974  for (; i < vectorized_size; i += PacketSize) {
975  Packet p = ploadu<Packet>(src + i);
976  pstoreu<Scalar, Packet>(dst + i, p);
977  }
978  if (HasHalfPacket) {
979  const IndexType vectorized_half_size = HalfPacketSize * (count / HalfPacketSize);
980  if (i < vectorized_half_size) {
981  HalfPacket p = ploadu<HalfPacket>(src + i);
982  pstoreu<Scalar, HalfPacket>(dst + i, p);
983  i += HalfPacketSize;
984  }
985  }
986  for (; i < count; ++i) {
987  dst[i] = src[i];
988  }
989  // ******************************************************************** //
990  } else if (kind == StridedLinearBufferCopy::Kind::Scatter) {
991  // Scatter from `src` to `dst`.
992  eigen_assert(src_stride == 1 && dst_stride != 1);
993  for (; i < vectorized_size; i += PacketSize) {
994  Packet p = ploadu<Packet>(src + i);
995  pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
996  }
997  if (HasHalfPacket) {
998  const IndexType vectorized_half_size = HalfPacketSize * (count / HalfPacketSize);
999  if (i < vectorized_half_size) {
1000  HalfPacket p = ploadu<HalfPacket>(src + i);
1001  pscatter<Scalar, HalfPacket>(dst + i * dst_stride, p, dst_stride);
1002  i += HalfPacketSize;
1003  }
1004  }
1005  for (; i < count; ++i) {
1006  dst[i * dst_stride] = src[i];
1007  }
1008  // ******************************************************************** //
1009  } else if (kind == StridedLinearBufferCopy::Kind::FillLinear) {
1010  // Fill `dst` with value at `*src`.
1011  eigen_assert(src_stride == 0 && dst_stride == 1);
1012 
1013  const IndexType unrolled_size = (4 * PacketSize) * (count / (4 * PacketSize));
1014  Scalar s = *src;
1015  Packet p = pset1<Packet>(s);
1016  for (; i < unrolled_size; i += 4 * PacketSize) {
1017  for (int j = 0; j < 4; ++j) {
1018  pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
1019  }
1020  }
1021  for (; i < vectorized_size; i += PacketSize) {
1022  pstoreu<Scalar, Packet>(dst + i, p);
1023  }
1024  if (HasHalfPacket) {
1025  const IndexType vectorized_half_size = HalfPacketSize * (count / HalfPacketSize);
1026  if (i < vectorized_half_size) {
1027  HalfPacket hp = pset1<HalfPacket>(s);
1028  pstoreu<Scalar, HalfPacket>(dst + i, hp);
1029  i += HalfPacketSize;
1030  }
1031  }
1032  for (; i < count; ++i) {
1033  dst[i] = s;
1034  }
1035  // ******************************************************************** //
1036  } else if (kind == StridedLinearBufferCopy::Kind::FillScatter) {
1037  // Scatter `*src` into `dst`.
1038  eigen_assert(src_stride == 0 && dst_stride != 1);
1039  Scalar s = *src;
1040  Packet p = pset1<Packet>(s);
1041  for (; i < vectorized_size; i += PacketSize) {
1042  pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
1043  }
1044  if (HasHalfPacket) {
1045  const IndexType vectorized_half_size = HalfPacketSize * (count / HalfPacketSize);
1046  if (i < vectorized_half_size) {
1047  HalfPacket hp = pset1<HalfPacket>(s);
1048  pscatter<Scalar, HalfPacket>(dst + i * dst_stride, hp, dst_stride);
1049  i += HalfPacketSize;
1050  }
1051  }
1052  for (; i < count; ++i) {
1053  dst[i * dst_stride] = s;
1054  }
1055  // ******************************************************************** //
1056  } else if (kind == StridedLinearBufferCopy::Kind::Gather) {
1057  // Gather from `src` into `dst`.
1058  eigen_assert(dst_stride == 1);
1059  for (; i < vectorized_size; i += PacketSize) {
1060  Packet p = pgather<Scalar, Packet>(src + i * src_stride, src_stride);
1061  pstoreu<Scalar, Packet>(dst + i, p);
1062  }
1063  if (HasHalfPacket) {
1064  const IndexType vectorized_half_size = HalfPacketSize * (count / HalfPacketSize);
1065  if (i < vectorized_half_size) {
1066  HalfPacket p = pgather<Scalar, HalfPacket>(src + i * src_stride, src_stride);
1067  pstoreu<Scalar, HalfPacket>(dst + i, p);
1068  i += HalfPacketSize;
1069  }
1070  }
1071  for (; i < count; ++i) {
1072  dst[i] = src[i * src_stride];
1073  }
1074  // ******************************************************************** //
1075  } else if (kind == StridedLinearBufferCopy::Kind::Random) {
1076  // Random.
1077  for (; i < count; ++i) {
1078  dst[i * dst_stride] = src[i * src_stride];
1079  }
1080  } else {
1081  eigen_assert(false);
1082  }
1083  }
1084 };
1085 
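// A minimal usage sketch of a statically-dispatched linear copy (hypothetical
// example function, not part of this header; the float scalar and the buffer
// size are arbitrary choices for illustration):
inline void strided_linear_copy_usage_sketch() {
  typedef StridedLinearBufferCopy<float, Eigen::Index> Copier;

  float src[64];
  float dst[64];
  for (int i = 0; i < 64; ++i) src[i] = static_cast<float>(i);

  // src_stride == 1 && dst_stride == 1, so the Linear kind (the vectorized and
  // unrolled path) is selected at compile time.
  Copier::Run<Copier::Kind::Linear>(Copier::Dst(/*offset=*/0, /*stride=*/1, dst),
                                    Copier::Src(/*offset=*/0, /*stride=*/1, src),
                                    /*count=*/64);
}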
1086 // -------------------------------------------------------------------------- //
1087 // TensorBlockIO copies data from the `src` tensor block to the `dst` tensor
1088 // block. It's possible to specify a src->dst dimension mapping for the copy
1089 // operation. Dimensions of `dst` specify how many elements have to be copied;
1090 // for `src` we only need to know the strides to navigate through the source
1091 // memory buffer.
1091 
1092 template <typename Scalar, typename IndexType, int NumDims, int Layout>
1093 class TensorBlockIO {
1094   static constexpr bool IsColMajor = (Layout == ColMajor);
1095 
1096   typedef StridedLinearBufferCopy<Scalar, IndexType> LinCopy;
1097 
1098  public:
1099   typedef DSizes<IndexType, NumDims> Dimensions;
1100   typedef DSizes<int, NumDims> DimensionsMap;
1101 
1102  struct Dst {
1103  Dst(const Dimensions& dst_dims, const Dimensions& dst_strides, Scalar* dst, IndexType dst_offset = 0)
1104  : dims(dst_dims), strides(dst_strides), data(dst), offset(dst_offset) {}
1105 
1106     Dimensions dims;
1107     Dimensions strides;
1108     Scalar* data;
1109     IndexType offset;
1110  };
1111 
1112  struct Src {
1113  Src(const Dimensions& src_strides, const Scalar* src, IndexType src_offset = 0)
1114  : strides(src_strides), data(src), offset(src_offset) {}
1115 
1116     Dimensions strides;
1117     const Scalar* data;
1118  IndexType offset;
1119  };
1120 
1121  // Copies data to `dst` from `src`, using provided dimensions mapping:
1122  //
1123  // src_dimension_index = dst_to_src_dim_map[dst_dimension_index]
1124  //
1125  // Returns the number of copied elements.
1126  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType Copy(const Dst& dst, const Src& src,
1127  const DimensionsMap& dst_to_src_dim_map) {
1128  // Copy single scalar value from `src` to `dst`.
1129  if (NumDims == 0) {
1130  *(dst.data + dst.offset) = *(src.data + src.offset);
1131  return 1;
1132  }
1133 
1134  // Both `dst` and `src` must have contiguous innermost dimension. We also
1135  // accept the special case with stride '0', because it's used as a trick to
1136  // implement broadcasting.
1137  {
1138  int inner_dim = IsColMajor ? 0 : NumDims - 1;
1139  EIGEN_UNUSED_VARIABLE(inner_dim);
1140  eigen_assert(dst.strides[inner_dim] == 1 || dst.strides[inner_dim] == 0);
1141  eigen_assert(src.strides[inner_dim] == 1 || src.strides[inner_dim] == 0);
1142  }
1143 
1144  // Give a shorter name to `dst_to_src_dim_map`.
1145  const DimensionsMap& dim_map = dst_to_src_dim_map;
1146 
1147  // Do not squeeze reordered inner dimensions.
1148  int num_squeezable_dims = NumSqueezableInnerDims(dim_map);
1149 
1150  // NOTE: We find the innermost dimension (contiguous in memory) in the dst
1151  // block, and we write data linearly into that dimension, reading it from
1152  // the src. If dimensions are reordered, we might end up reading data from
1153  // the src with `stride != 1`.
1154  //
1155  // NOTE: Random-Read/Linear-Write can be up to ~2X faster than
1156  // Linear-Read/Random-Write: https://stackoverflow.com/a/54935680
1157 
1158  // Find the innermost dimension in the dst whose size is not 1. This is the
1159  // effective inner dim.
1160  int num_size_one_inner_dims = 0;
1161  for (int i = 0; i < num_squeezable_dims; ++i) {
1162  const int dst_dim = IsColMajor ? i : NumDims - i - 1;
1163  if (dst.dims[dst_dim] != 1) break;
1164  num_size_one_inner_dims++;
1165  }
1166 
1167  // If all dimensions are of size 1, just copy a scalar from `src` to `dst`.
1168  if (num_size_one_inner_dims == NumDims) {
1169  *(dst.data + dst.offset) = *(src.data + src.offset);
1170  return 1;
1171  }
1172 
1173  // Outermost dimension in the dst with `stride == 1` (contiguous in memory).
1174  const int dst_stride1_dim = IsColMajor ? num_size_one_inner_dims : NumDims - num_size_one_inner_dims - 1;
1175 
1176  // Dimension in the src that corresponds to the dst innermost dimension.
1177  const int src_dim_for_dst_stride1_dim = NumDims == 0 ? 1 : dim_map[dst_stride1_dim];
1178 
1179  // Size of the innermost dimension (length of contiguous blocks of memory).
1180  IndexType dst_inner_dim_size = NumDims == 0 ? 1 : dst.dims[dst_stride1_dim];
1181 
1182  // Squeeze multiple inner dims into one if they are contiguous in `dst` and
1183  // `src` memory, so we can do less linear copy calls.
1184  for (int i = num_size_one_inner_dims + 1; i < num_squeezable_dims; ++i) {
1185  const int dst_dim = IsColMajor ? i : NumDims - i - 1;
1186  const IndexType dst_stride = dst.strides[dst_dim];
1187  const IndexType src_stride = src.strides[dim_map[dst_dim]];
1188  if (dst_inner_dim_size == dst_stride && dst_stride == src_stride) {
1189  dst_inner_dim_size *= dst.dims[dst_dim];
1190  ++num_size_one_inner_dims;
1191  } else {
1192  break;
1193  }
1194  }
1195 
1196  // Setup strides to read data from `src` and write to `dst`.
1197  IndexType input_offset = src.offset;
1198  IndexType output_offset = dst.offset;
1199  IndexType input_stride = NumDims == 0 ? 1 : src.strides[src_dim_for_dst_stride1_dim];
1200  IndexType output_stride = NumDims == 0 ? 1 : dst.strides[dst_stride1_dim];
1201 
1202  const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1;
1203     array<BlockIteratorState, at_least_1_dim> it;
1204 
1205  // Initialize block iterator state. Squeeze away any dimension of size 1.
1206  int idx = 0; // currently initialized iterator state index
1207  for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) {
1208  const int dst_dim = IsColMajor ? i + 1 : NumDims - i - 2;
1209  if (dst.dims[dst_dim] == 1) continue;
1210 
1211  it[idx].size = dst.dims[dst_dim];
1212  it[idx].input_stride = src.strides[dim_map[dst_dim]];
1213  it[idx].output_stride = dst.strides[dst_dim];
1214 
1215  it[idx].input_span = it[idx].input_stride * (it[idx].size - 1);
1216  it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
1217 
1218  idx++;
1219  }
1220 
1221  // Iterate copying data from src to dst.
1222  const IndexType block_total_size = NumDims == 0 ? 1 : dst.dims.TotalSize();
1223 
1224 #define COPY_INNER_DIM(KIND) \
1225  IndexType num_copied = 0; \
1226  for (num_copied = 0; num_copied < block_total_size; num_copied += dst_inner_dim_size) { \
1227  LinCopy::template Run<KIND>(typename LinCopy::Dst(output_offset, output_stride, dst.data), \
1228  typename LinCopy::Src(input_offset, input_stride, src.data), dst_inner_dim_size); \
1229  \
1230  for (int j = 0; j < idx; ++j) { \
1231  if (++it[j].count < it[j].size) { \
1232  input_offset += it[j].input_stride; \
1233  output_offset += it[j].output_stride; \
1234  break; \
1235  } \
1236  it[j].count = 0; \
1237  input_offset -= it[j].input_span; \
1238  output_offset -= it[j].output_span; \
1239  } \
1240  } \
1241  return num_copied;
1242 
1243     if (input_stride == 1 && output_stride == 1) {
1244       COPY_INNER_DIM(LinCopy::Kind::Linear);
1245     } else if (input_stride == 1 && output_stride != 1) {
1246       COPY_INNER_DIM(LinCopy::Kind::Scatter);
1247     } else if (input_stride == 0 && output_stride == 1) {
1248       COPY_INNER_DIM(LinCopy::Kind::FillLinear);
1249     } else if (input_stride == 0 && output_stride != 1) {
1250       COPY_INNER_DIM(LinCopy::Kind::FillScatter);
1251     } else if (output_stride == 1) {
1252       COPY_INNER_DIM(LinCopy::Kind::Gather);
1253     } else {
1254       COPY_INNER_DIM(LinCopy::Kind::Random);
1255     }
1256 
1257 #undef COPY_INNER_DIM
1258  }
1259 
1260  // Copy from `src` to `dst` with an identity src->dst dimension map. Returns
1261  // the number of copied elements.
1262  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexType Copy(const Dst& dst, const Src& src) {
1263  DimensionsMap dst_to_src_map;
1264  for (int i = 0; i < NumDims; ++i) dst_to_src_map[i] = i;
1265  return Copy(dst, src, dst_to_src_map);
1266  }
1267 
1268  private:
1269   struct BlockIteratorState {
1270     BlockIteratorState() : size(0), count(0), input_stride(0), output_stride(0), input_span(0), output_span(0) {}
1271 
1272  IndexType size;
1273  IndexType count;
1274  IndexType input_stride;
1275  IndexType output_stride;
1276  IndexType input_span;
1277  IndexType output_span;
1278  };
1279 
1280  // Compute how many inner dimensions it's allowed to squeeze when doing IO
1281  // between two tensor blocks. It's safe to squeeze inner dimensions, only
1282  // if they are not reordered.
1283  static int NumSqueezableInnerDims(const DimensionsMap& dim_map) {
1284  int num_squeezable_dims = 0;
1285  for (int i = 0; i < NumDims; ++i) {
1286  const int dim = IsColMajor ? i : NumDims - i - 1;
1287  if (dim_map[dim] != dim) break;
1288  num_squeezable_dims++;
1289  }
1290  return num_squeezable_dims;
1291  }
1292 };
1293 
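// A minimal usage sketch of TensorBlockIO with an identity dimension map
// (hypothetical example function, not part of this header; the 4x8 shape is
// an arbitrary choice for illustration):
inline void tensor_block_io_usage_sketch() {
  typedef TensorBlockIO<float, Eigen::Index, 2, ColMajor> IO;

  float src_buf[4 * 8];
  float dst_buf[4 * 8];
  for (int i = 0; i < 4 * 8; ++i) src_buf[i] = static_cast<float>(i);

  const IO::Dimensions dims(4, 8);
  const IO::Dimensions buf_strides = internal::strides<ColMajor>(dims);

  IO::Dst dst(dims, buf_strides, dst_buf);
  IO::Src src(buf_strides, src_buf);

  // Both buffers are contiguous col-major 4x8, so all 32 coefficients are
  // copied with a single squeezed inner dimension.
  const Eigen::Index copied = IO::Copy(dst, src);
  eigen_assert(copied == 32);
  EIGEN_UNUSED_VARIABLE(copied);
}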
1294 // -------------------------------------------------------------------------- //
1295 // TensorBlockAssignment assigns a block expression of type `TensorBlockExpr` to
1296 // a Tensor block defined by `desc`, backed by a memory buffer at `target`.
1297 //
1298 // Currently there is no way to write from a Tensor expression to a block of
1299 // memory, if dimensions are reordered. If you need to do that, you should
1300 // materialize a Tensor block expression into a memory buffer, and then use
1301 // TensorBlockIO to copy data between two memory buffers with a custom
1302 // `target->src` dimension map (see definition above).
1303 //
1304 // Also currently the innermost dimension of `target` must have a stride '1'
1305 // (contiguous in memory). This restriction could be lifted with a `pscatter`,
1306 // but in practice it's never needed, and there is a similar TensorBlockIO
1307 // workaround for that.
1308 //
1309 // TODO(ezhulenev): TensorBlockAssignment is a special case of TensorBlockIO
1310 // where `src` is a tensor expression. Explore if it is possible to rewrite IO
1311 // to use expressions instead of pointers, and after that TensorBlockAssignment
1312 // will become an alias to IO.
1313 template <typename Scalar, int NumDims, typename TensorBlockExpr, typename IndexType = Eigen::Index>
1314 class TensorBlockAssignment {
1315   // We will use coeff/packet path to evaluate block expressions.
1316   typedef TensorEvaluator<const TensorBlockExpr, DefaultDevice> TensorBlockEvaluator;
1317 
1318   typedef DSizes<IndexType, NumDims> Dimensions;
1319 
1320   enum { Vectorizable = packet_traits<Scalar>::Vectorizable, PacketSize = packet_traits<Scalar>::size };
1321 
1322  template <bool Vectorizable, typename Evaluator>
1323   struct InnerDimAssign {
1324     EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count, const Evaluator& eval, IndexType eval_offset) {
1325  for (IndexType i = 0; i < count; ++i) {
1326  target[i] = eval.coeff(eval_offset + i);
1327  }
1328  }
1329  };
1330 
1331  template <typename Evaluator>
1332  struct InnerDimAssign<true, Evaluator> {
1333  EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count, const Evaluator& eval, IndexType eval_offset) {
1334  typedef typename packet_traits<Scalar>::type Packet;
1335 
1336  const IndexType unrolled_size = (4 * PacketSize) * (count / (4 * PacketSize));
1337  const IndexType vectorized_size = PacketSize * (count / PacketSize);
1338  IndexType i = 0;
1339 
1340  for (; i < unrolled_size; i += 4 * PacketSize) {
1341  for (int j = 0; j < 4; ++j) {
1342  const IndexType idx = eval_offset + i + j * PacketSize;
1343  Packet p = eval.template packet<Unaligned>(idx);
1344  pstoreu<Scalar>(target + i + j * PacketSize, p);
1345  }
1346  }
1347 
1348  for (; i < vectorized_size; i += PacketSize) {
1349  Packet p = eval.template packet<Unaligned>(eval_offset + i);
1350  pstoreu<Scalar>(target + i, p);
1351  }
1352 
1353  for (; i < count; ++i) {
1354  target[i] = eval.coeff(eval_offset + i);
1355  }
1356  }
1357  };
1358 
1359  public:
1360  struct Target {
1361  Target(const Dimensions& target_dims, const Dimensions& target_strides, Scalar* target_data,
1362  IndexType target_offset = 0)
1363  : dims(target_dims), strides(target_strides), data(target_data), offset(target_offset) {}
1364 
1365     Dimensions dims;
1366     Dimensions strides;
1367     Scalar* data;
1368     IndexType offset;
1369  };
1370 
1371  static Target target(const Dimensions& target_dims, const Dimensions& target_strides, Scalar* target_data,
1372  IndexType target_offset = 0) {
1373  return Target(target_dims, target_strides, target_data, target_offset);
1374  }
1375 
1376  template <typename TargetDimsIndexType, typename TargetStridesIndexType>
1377   static Target target(const DSizes<TargetDimsIndexType, NumDims>& target_dims,
1378                        const DSizes<TargetStridesIndexType, NumDims>& target_strides, Scalar* target_data,
1379  IndexType target_offset = 0) {
1380  // DSizes constructor will do index type promotion if it's safe.
1381  return Target(Dimensions(target_dims), Dimensions(target_strides), target_data, target_offset);
1382  }
1383 
1384  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Target& target, const TensorBlockExpr& expr) {
1385  // Prepare evaluator for block expression.
1386  DefaultDevice default_device;
1387  TensorBlockEvaluator eval(expr, default_device);
1388 
1389  // Tensor block expression dimension should match destination dimensions.
1390  eigen_assert(dimensions_match(target.dims, eval.dimensions()));
1391 
1392  static const int Layout = TensorBlockEvaluator::Layout;
1393  static const bool is_col_major = Layout == ColMajor;
1394 
1395  // Initialize output inner dimension size based on a layout.
1396  const IndexType output_size = NumDims == 0 ? 1 : target.dims.TotalSize();
1397  const int inner_dim_idx = is_col_major ? 0 : NumDims - 1;
1398  IndexType output_inner_dim_size = target.dims[inner_dim_idx];
1399 
1400  // Target inner dimension stride must be '1'.
1401  eigen_assert(target.strides[inner_dim_idx] == 1);
1402 
1403  // Squeeze multiple inner dims into one if they are contiguous in `target`.
1404  IndexType num_squeezed_dims = 0;
1405  for (Index i = 1; i < NumDims; ++i) {
1406  const Index dim = is_col_major ? i : NumDims - i - 1;
1407  const IndexType target_stride = target.strides[dim];
1408 
1409  if (output_inner_dim_size == target_stride) {
1410  output_inner_dim_size *= target.dims[dim];
1411  num_squeezed_dims++;
1412  } else {
1413  break;
1414  }
1415  }
1416 
1417     // Initialize output block iterator state. Dimensions in this array are
1418  // always in inner_most -> outer_most order (col major layout).
1420 
1421  int idx = 0; // currently initialized iterator state index
1422  for (Index i = num_squeezed_dims; i < NumDims - 1; ++i) {
1423  const Index dim = is_col_major ? i + 1 : NumDims - i - 2;
1424 
1425  it[idx].count = 0;
1426  it[idx].size = target.dims[dim];
1427  it[idx].output_stride = target.strides[dim];
1428  it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
1429  idx++;
1430  }
1431 
1432  // We read block expression from the beginning, and start writing data to
1433  // `target` at given offset.
1434  IndexType input_offset = 0;
1435  IndexType output_offset = target.offset;
1436 
1437  // Iterate copying data from `eval` to `target`.
1438  for (IndexType i = 0; i < output_size; i += output_inner_dim_size) {
1439  // Assign to `target` at current offset.
1440       InnerDimAssign<Vectorizable && TensorBlockEvaluator::PacketAccess, TensorBlockEvaluator>::Run(
1441           target.data + output_offset, output_inner_dim_size, eval, input_offset);
1442 
1443  // Move input offset forward by the number of assigned coefficients.
1444  input_offset += output_inner_dim_size;
1445 
1446  // Update index.
1447  for (int j = 0; j < idx; ++j) {
1448  if (++it[j].count < it[j].size) {
1449  output_offset += it[j].output_stride;
1450  break;
1451  }
1452  it[j].count = 0;
1453  output_offset -= it[j].output_span;
1454  }
1455  }
1456  }
1457 
1458  private:
1459   struct BlockIteratorState {
1460     BlockIteratorState() : count(0), size(0), output_stride(0), output_span(0) {}
1461 
1462  IndexType count;
1463  IndexType size;
1464  IndexType output_stride;
1465  IndexType output_span;
1466  };
1467 };
1468 
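// A minimal usage sketch of TensorBlockAssignment (hypothetical example
// function, not part of this header; a plain TensorMap is used as the block
// expression, and the 4x8 shape is an arbitrary choice for illustration):
inline void tensor_block_assignment_usage_sketch() {
  typedef TensorMap<const Tensor<float, 2, ColMajor> > BlockExpr;
  typedef TensorBlockAssignment<float, 2, BlockExpr, Eigen::Index> Assignment;

  float src_buf[4 * 8];
  float dst_buf[4 * 8];
  for (int i = 0; i < 4 * 8; ++i) src_buf[i] = static_cast<float>(i);

  const DSizes<Eigen::Index, 2> dims(4, 8);
  BlockExpr expr(src_buf, 4, 8);

  // Evaluate the (trivial) block expression into `dst_buf`, which has the
  // same dimensions and contiguous col-major strides.
  Assignment::Run(Assignment::target(dims, internal::strides<ColMajor>(dims), dst_buf), expr);
}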
1469 // -------------------------------------------------------------------------- //
1470 
1471 } // namespace internal
1472 } // namespace Eigen
1473 
1474 #endif // EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H
int i
Definition: BiCGSTAB_step_by_step.cpp:9
#define EIGEN_RESTRICT
Definition: Macros.h:1067
#define EIGEN_ALWAYS_INLINE
Definition: Macros.h:845
#define EIGEN_UNUSED_VARIABLE(var)
Definition: Macros.h:966
#define EIGEN_DEVICE_FUNC
Definition: Macros.h:892
#define eigen_assert(x)
Definition: Macros.h:910
#define EIGEN_STRONG_INLINE
Definition: Macros.h:834
#define COPY_INNER_DIM(KIND)
float * p
Definition: Tutorial_Map_using.cpp:9
Scalar Scalar int size
Definition: benchVecAdd.cpp:17
SCALAR Scalar
Definition: bench_gemm.cpp:45
internal::packet_traits< Scalar >::type Packet
Definition: benchmark-blocking-sizes.cpp:54
Definition: TensorExpr.h:162
Definition: TensorExpr.h:97
A tensor expression mapping an existing array of data.
Definition: TensorMap.h:33
Definition: TensorCostModel.h:28
Definition: TensorBlock.h:902
packet_traits< Scalar >::type Packet
Definition: TensorBlock.h:903
Kind
Definition: TensorBlock.h:914
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Dst &dst, const Src &src, const size_t count)
Definition: TensorBlock.h:940
@ HalfPacketSize
Definition: TensorBlock.h:908
@ PacketSize
Definition: TensorBlock.h:907
@ Vectorizable
Definition: TensorBlock.h:906
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const IndexType count, const IndexType dst_offset, const IndexType dst_stride, Scalar *EIGEN_RESTRICT dst_data, const IndexType src_offset, const IndexType src_stride, const Scalar *EIGEN_RESTRICT src_data)
Definition: TensorBlock.h:946
unpacket_traits< Packet >::half HalfPacket
Definition: TensorBlock.h:904
Definition: TensorBlock.h:1314
DSizes< IndexType, NumDims > Dimensions
Definition: TensorBlock.h:1318
@ PacketSize
Definition: TensorBlock.h:1320
@ Vectorizable
Definition: TensorBlock.h:1320
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Target &target, const TensorBlockExpr &expr)
Definition: TensorBlock.h:1384
static Target target(const Dimensions &target_dims, const Dimensions &target_strides, Scalar *target_data, IndexType target_offset=0)
Definition: TensorBlock.h:1371
static Target target(const DSizes< TargetDimsIndexType, NumDims > &target_dims, const DSizes< TargetStridesIndexType, NumDims > &target_strides, Scalar *target_data, IndexType target_offset=0)
Definition: TensorBlock.h:1377
TensorEvaluator< const TensorBlockExpr, DefaultDevice > TensorBlockEvaluator
Definition: TensorBlock.h:1316
static DestinationBuffer make(const TensorBlockDescriptor &desc, Scalar *data, const Dimensions &strides)
Definition: TensorBlock.h:237
static DestinationBufferKind kind(const TensorBlockDescriptor &desc, const Dimensions &strides)
Definition: TensorBlock.h:242
DestinationBufferKind m_kind
Definition: TensorBlock.h:261
Dimensions m_strides
Definition: TensorBlock.h:259
const DestinationBufferKind & kind() const
Definition: TensorBlock.h:225
const Dimensions & strides() const
Definition: TensorBlock.h:224
Scalar * data() const
Definition: TensorBlock.h:219
size_t m_data_type_size
Definition: TensorBlock.h:255
DestinationBuffer(Scalar *data, const Dimensions &strides, DestinationBufferKind kind)
Definition: TensorBlock.h:233
Definition: TensorBlock.h:171
TensorBlockDescriptor WithOffset(IndexType offset) const
Definition: TensorBlock.h:298
void AddDestinationBuffer(Scalar *dst_base, const DSizes< DstStridesIndexType, NumDims > &dst_strides)
Definition: TensorBlock.h:284
void AddDestinationBuffer(Scalar *dst_base, const Dimensions &dst_strides)
Definition: TensorBlock.h:278
DSizes< IndexType, NumDims > Dimensions
Definition: TensorBlock.h:173
IndexType size() const
Definition: TensorBlock.h:273
const DestinationBuffer & destination() const
Definition: TensorBlock.h:275
IndexType offset() const
Definition: TensorBlock.h:270
bool HasDestinationBuffer() const
Definition: TensorBlock.h:295
TensorBlockDescriptor & DropDestinationBuffer()
Definition: TensorBlock.h:289
IndexType dimension(int index) const
Definition: TensorBlock.h:272
const Dimensions & dimensions() const
Definition: TensorBlock.h:271
const IndexType m_offset
Definition: TensorBlock.h:305
DestinationBuffer m_destination
Definition: TensorBlock.h:307
const Dimensions m_dimensions
Definition: TensorBlock.h:306
TensorBlockDescriptor(const IndexType offset, const Dimensions &dimensions, const DestinationBuffer &destination)
Definition: TensorBlock.h:264
TensorBlockDescriptor(const IndexType offset, const Dimensions &dimensions)
Definition: TensorBlock.h:267
Definition: TensorBlock.h:1093
static int NumSqueezableInnerDims(const DimensionsMap &dim_map)
Definition: TensorBlock.h:1283
static constexpr bool IsColMajor
Definition: TensorBlock.h:1094
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType Copy(const Dst &dst, const Src &src, const DimensionsMap &dst_to_src_dim_map)
Definition: TensorBlock.h:1126
StridedLinearBufferCopy< Scalar, IndexType > LinCopy
Definition: TensorBlock.h:1096
DSizes< IndexType, NumDims > Dimensions
Definition: TensorBlock.h:1099
DSizes< int, NumDims > DimensionsMap
Definition: TensorBlock.h:1100
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexType Copy(const Dst &dst, const Src &src)
Definition: TensorBlock.h:1262
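
A usage sketch: copying one 4x3 block between buffers with different storage orders. The template-parameter order <Scalar, IndexType, NumDims, Layout> is assumed.

#include <unsupported/Eigen/CXX11/Tensor>
#include <vector>
using namespace Eigen;

int main() {
  // Assumed parameter order: <Scalar, IndexType, NumDims, Layout>.
  using IO = internal::TensorBlockIO<float, Index, 2, ColMajor>;

  IO::Dimensions dims(4, 3);
  std::vector<float> src(12), dst(12);
  for (int i = 0; i < 12; ++i) src[i] = float(i);

  // Source is stored row-major, destination column-major: Copy() walks the
  // 4x3 block once and converts the storage order while copying.
  IO::Dst io_dst(dims, internal::strides<ColMajor>(dims), dst.data());
  IO::Src io_src(internal::strides<RowMajor>(dims), src.data());
  IO::Copy(io_dst, io_src);  // overload with an identity dimension map
  return 0;
}
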
Definition: TensorBlock.h:314
IndexType m_total_block_count
Definition: TensorBlock.h:459
TensorBlockResourceRequirements m_requirements
Definition: TensorBlock.h:456
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DSizes< IndexType, NumDims > & blockDimensions() const
Definition: TensorBlock.h:331
TensorBlockMapper(const DSizes< IndexType, NumDims > &dimensions, const TensorBlockResourceRequirements &requirements)
Definition: TensorBlock.h:321
DSizes< IndexType, NumDims > m_block_dimensions
Definition: TensorBlock.h:458
TensorBlockDescriptor< NumDims, IndexType > BlockDescriptor
Definition: TensorBlock.h:315
DSizes< IndexType, NumDims > m_tensor_strides
Definition: TensorBlock.h:461
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockCount() const
Definition: TensorBlock.h:327
DSizes< IndexType, NumDims > m_tensor_dimensions
Definition: TensorBlock.h:455
DSizes< IndexType, NumDims > m_block_strides
Definition: TensorBlock.h:462
void InitializeBlockDimensions()
Definition: TensorBlock.h:359
DSizes< IndexType, NumDims > Dimensions
Definition: TensorBlock.h:318
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockTotalSize() const
Definition: TensorBlock.h:329
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockDescriptor blockDescriptor(IndexType block_index) const
Definition: TensorBlock.h:335
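
A usage sketch of the mapper: partition a 2-D tensor into blocks of roughly 16 KiB and walk the resulting descriptors. The <NumDims, Layout, IndexType> parameter order and the Scalar template argument of skewed() are assumptions.

#include <unsupported/Eigen/CXX11/Tensor>
using namespace Eigen;

int main() {
  // Assumed parameter order: <NumDims, Layout, IndexType>.
  using Mapper = internal::TensorBlockMapper<2, ColMajor, Index>;

  DSizes<Index, 2> tensor_dims(1024, 768);
  auto reqs = internal::TensorBlockResourceRequirements::skewed<float>(16 * 1024);

  Mapper mapper(tensor_dims, reqs);
  for (Index b = 0; b < mapper.blockCount(); ++b) {
    auto desc = mapper.blockDescriptor(b);
    // desc.offset() and desc.dimensions() identify one block; blocks at the
    // tensor boundaries may be smaller than mapper.blockDimensions().
    (void)desc;
  }
  return 0;
}
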
Definition: TensorBlock.h:566
void XprType
Definition: TensorBlock.h:568
void * allocate(size_t size)
Definition: TensorBlock.h:485
int m_allocation_index
Definition: TensorBlock.h:530
const Device & m_device
Definition: TensorBlock.h:529
~TensorBlockScratchAllocator()
Definition: TensorBlock.h:479
std::vector< Allocation > m_allocations
Definition: TensorBlock.h:532
TensorBlockScratchAllocator(const Device &device)
Definition: TensorBlock.h:477
void reset()
Definition: TensorBlock.h:521
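
A usage sketch with the default single-threaded device:

#include <unsupported/Eigen/CXX11/Tensor>
using namespace Eigen;

int main() {
  DefaultDevice device;
  internal::TensorBlockScratchAllocator<DefaultDevice> scratch(device);

  // Allocations are served by the device and kept until the allocator is
  // reset or destroyed, so the memory can be reused across blocks.
  float* block_buf = static_cast<float*>(scratch.allocate(1024 * sizeof(float)));
  block_buf[0] = 1.0f;

  scratch.reset();  // existing allocations become reusable for the next block
  return 0;
}
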
Definition: TensorBlock.h:796
XprScalar< XprType >::type Scalar
Definition: TensorBlock.h:806
TensorCwiseBinaryBlock(const LhsTensorBlock &left_block, const RhsTensorBlock &right_block, const BinaryOp &functor)
Definition: TensorBlock.h:808
TensorBlockKind kind() const
Definition: TensorBlock.h:811
static constexpr bool NoArgBlockAccess
Definition: TensorBlock.h:797
XprType expr() const
Definition: TensorBlock.h:813
RhsTensorBlock m_right_block
Definition: TensorBlock.h:824
std::conditional_t< NoArgBlockAccess, void, TensorCwiseBinaryOp< BinaryOp, const typename LhsTensorBlock::XprType, const typename RhsTensorBlock::XprType > > XprType
Definition: TensorBlock.h:804
void cleanup()
Definition: TensorBlock.h:817
const Scalar * data() const
Definition: TensorBlock.h:815
LhsTensorBlock m_left_block
Definition: TensorBlock.h:823
BinaryOp m_functor
Definition: TensorBlock.h:825
Definition: TensorBlock.h:767
ArgTensorBlock m_arg_block
Definition: TensorBlock.h:787
static constexpr bool NoArgBlockAccess
Definition: TensorBlock.h:768
std::conditional_t< NoArgBlockAccess, void, TensorCwiseUnaryOp< UnaryOp, const typename ArgTensorBlock::XprType > > XprType
Definition: TensorBlock.h:773
XprScalar< XprType >::type Scalar
Definition: TensorBlock.h:775
TensorCwiseUnaryBlock(const ArgTensorBlock &arg_block, const UnaryOp &functor)
Definition: TensorBlock.h:777
UnaryOp m_functor
Definition: TensorBlock.h:788
void cleanup()
Definition: TensorBlock.h:784
const Scalar * data() const
Definition: TensorBlock.h:783
TensorBlockKind kind() const
Definition: TensorBlock.h:780
XprType expr() const
Definition: TensorBlock.h:782
bool m_materialized_in_output
Definition: TensorBlock.h:664
TensorMaterializedBlock AsTensorMaterializedBlock() const
Definition: TensorBlock.h:644
bool m_strided_storage
Definition: TensorBlock.h:665
Dimensions m_strides
Definition: TensorBlock.h:663
Scalar * m_data
Definition: TensorBlock.h:661
Scalar * data() const
Definition: TensorBlock.h:640
Dimensions m_dimensions
Definition: TensorBlock.h:662
Storage(Scalar *data, const Dimensions &dimensions, const Dimensions &strides, bool materialized_in_output, bool strided_storage)
Definition: TensorBlock.h:653
const Dimensions & strides() const
Definition: TensorBlock.h:642
const Dimensions & dimensions() const
Definition: TensorBlock.h:641
Definition: TensorBlock.h:604
TensorMaterializedBlock(TensorBlockKind kind, const Scalar *data, const Dimensions &dimensions, bool valid_expr=true)
Definition: TensorBlock.h:609
static EIGEN_STRONG_INLINE TensorMaterializedBlock materialize(const Scalar *data, const DataDimensions &data_dims, TensorBlockDesc &desc, TensorBlockScratch &scratch)
Definition: TensorBlock.h:699
const Scalar * data() const
Definition: TensorBlock.h:625
const XprType & expr() const
Definition: TensorBlock.h:621
TensorBlockKind kind() const
Definition: TensorBlock.h:617
static EIGEN_STRONG_INLINE Storage prepareStorage(TensorBlockDesc &desc, TensorBlockScratch &scratch, bool allow_strided_storage=false)
Definition: TensorBlock.h:671
bool m_valid_expr
Definition: TensorBlock.h:759
XprType m_expr
Definition: TensorBlock.h:758
TensorBlockKind m_kind
Definition: TensorBlock.h:755
const Scalar * m_data
Definition: TensorBlock.h:756
Dimensions m_dimensions
Definition: TensorBlock.h:757
internal::TensorBlockDescriptor< NumDims, IndexType > TensorBlockDesc
Definition: TensorBlock.h:628
DSizes< IndexType, NumDims > Dimensions
Definition: TensorBlock.h:606
TensorMap< const Tensor< Scalar, NumDims, Layout > > XprType
Definition: TensorBlock.h:607
void cleanup()
Definition: TensorBlock.h:626
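
A usage sketch of materialize(): given a raw column-major buffer and a block descriptor, the result is either a view into the original data or a copy placed into scratch memory. The <Scalar, NumDims, Layout, IndexType> parameter order is an assumption.

#include <unsupported/Eigen/CXX11/Tensor>
using namespace Eigen;

int main() {
  // Assumed parameter order: <Scalar, NumDims, Layout, IndexType>.
  using Block = internal::TensorMaterializedBlock<float, 2, ColMajor, Index>;
  using Desc  = internal::TensorBlockDescriptor<2, Index>;

  Tensor<float, 2> input(16, 16);
  input.setRandom();

  // Columns 4..7 of a column-major 16x16 tensor form one contiguous span.
  Desc desc(/*offset=*/4 * 16, Desc::Dimensions(16, 4));

  DefaultDevice device;
  internal::TensorBlockScratchAllocator<DefaultDevice> scratch(device);

  Block block = Block::materialize(input.data(), input.dimensions(), desc, scratch);
  bool is_view = block.kind() == internal::TensorBlockKind::kView;
  (void)is_view;    // if true, block.data() points directly into `input`
  block.cleanup();
  return 0;
}
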
Definition: TensorBlock.h:861
XprType expr() const
Definition: TensorBlock.h:882
TensorBlockKind kind() const
Definition: TensorBlock.h:881
Arg1TensorBlock::XprType Arg1XprType
Definition: TensorBlock.h:862
TensorTernaryExprBlock(const Arg1TensorBlock &arg1_block, const Arg2TensorBlock &arg2_block, const Arg3TensorBlock &arg3_block, const BlockFactory &factory)
Definition: TensorBlock.h:877
std::conditional_t< NoArgBlockAccess, void, typename BlockFactory::template XprType< Arg1XprType, Arg2XprType, Arg3XprType >::type > XprType
Definition: TensorBlock.h:873
const Scalar * data() const
Definition: TensorBlock.h:883
Arg3TensorBlock::XprType Arg3XprType
Definition: TensorBlock.h:864
BlockFactory m_factory
Definition: TensorBlock.h:894
Arg3TensorBlock m_arg3_block
Definition: TensorBlock.h:893
static constexpr bool NoArgBlockAccess
Definition: TensorBlock.h:866
Arg1TensorBlock m_arg1_block
Definition: TensorBlock.h:891
Arg2TensorBlock m_arg2_block
Definition: TensorBlock.h:892
XprScalar< XprType >::type Scalar
Definition: TensorBlock.h:875
void cleanup()
Definition: TensorBlock.h:884
Arg2TensorBlock::XprType Arg2XprType
Definition: TensorBlock.h:863
Definition: TensorBlock.h:834
TensorUnaryExprBlock(const ArgTensorBlock &arg_block, const BlockFactory &factory)
Definition: TensorBlock.h:843
XprType expr() const
Definition: TensorBlock.h:847
XprScalar< XprType >::type Scalar
Definition: TensorBlock.h:841
void cleanup()
Definition: TensorBlock.h:849
ArgTensorBlock::XprType ArgXprType
Definition: TensorBlock.h:835
static constexpr bool NoArgBlockAccess
Definition: TensorBlock.h:836
const Scalar * data() const
Definition: TensorBlock.h:848
std::conditional_t< NoArgBlockAccess, void, typename BlockFactory::template XprType< ArgXprType >::type > XprType
Definition: TensorBlock.h:839
BlockFactory m_factory
Definition: TensorBlock.h:853
ArgTensorBlock m_arg_block
Definition: TensorBlock.h:852
TensorBlockKind kind() const
Definition: TensorBlock.h:846

@ ColMajor
Definition: Constants.h:318
@ Target
Definition: Constants.h:495
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 pow(const bfloat16 &a, const bfloat16 &b)
Definition: BFloat16.h:625
TensorBlockKind
Definition: TensorBlock.h:538
@ kMaterializedInOutput
Definition: TensorBlock.h:559
@ kMaterializedInScratch
Definition: TensorBlock.h:550
@ kView
Definition: TensorBlock.h:545
@ kExpr
Definition: TensorBlock.h:541
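
For illustration, a small helper (not part of this file) showing how a consumer typically branches on the block kind:

#include <unsupported/Eigen/CXX11/Tensor>

// Returns true if the block's coefficients are already materialized in
// memory (view, scratch, or output buffer); false if it is still a lazy
// expression that has to be evaluated via block.expr().
template <typename Block>
bool block_is_materialized(const Block& block) {
  using Eigen::internal::TensorBlockKind;
  switch (block.kind()) {
    case TensorBlockKind::kView:
    case TensorBlockKind::kMaterializedInScratch:
    case TensorBlockKind::kMaterializedInOutput:
      return true;
    case TensorBlockKind::kExpr:
    default:
      return false;
  }
}
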
constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE auto array_prod(const array< T, N > &arr) -> decltype(array_reduce< product_op, T, N >(arr, static_cast< T >(1)))
Definition: MoreMeta.h:497
EIGEN_ALWAYS_INLINE DSizes< IndexType, NumDims > strides(const DSizes< IndexType, NumDims > &dimensions)
Definition: TensorBlock.h:29
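
For example, a 2x3x4 tensor gets the usual dense strides for either layout:

#include <unsupported/Eigen/CXX11/Tensor>
using namespace Eigen;

int main() {
  DSizes<Index, 3> dims(2, 3, 4);

  DSizes<Index, 3> cm = internal::strides<ColMajor>(dims);  // {1, 2, 6}
  DSizes<Index, 3> rm = internal::strides<RowMajor>(dims);  // {12, 4, 1}

  eigen_assert(cm[2] == 6 && rm[0] == 12);
  return 0;
}
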
TensorBlockShapeType
Definition: TensorBlock.h:73
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T maxi(const T &x, const T &y)
Definition: MathFunctions.h:926
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE EIGEN_CONSTEXPR T div_ceil(T a, T b)
Definition: MathFunctions.h:1251
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T mini(const T &x, const T &y)
Definition: MathFunctions.h:920
Namespace containing all symbols from the Eigen library.
Definition: bench_norm.cpp:70
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool dimensions_match(Dims1 dims1, Dims2 dims2)
Definition: TensorDimensions.h:322
std::array< T, N > array
Definition: EmulateArray.h:231
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition: Meta.h:83
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex TotalSize() const
Definition: TensorDimensions.h:167
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const
Definition: TensorDimensions.h:165
Definition: TensorDeviceDefault.h:19
Definition: TensorDimensions.h:85
Evaluator for tensor expressions; computes coefficient (and packet) values of an expression on a given device.
Definition: TensorEvaluator.h:31
static constexpr int Layout
Definition: TensorEvaluator.h:46
Dst(IndexType o, IndexType s, Scalar *d)
Definition: TensorBlock.h:924
IndexType offset
Definition: TensorBlock.h:926
Scalar * data
Definition: TensorBlock.h:928
IndexType stride
Definition: TensorBlock.h:927
const Scalar * data
Definition: TensorBlock.h:936
Src(IndexType o, IndexType s, const Scalar *d)
Definition: TensorBlock.h:932
IndexType offset
Definition: TensorBlock.h:934
IndexType stride
Definition: TensorBlock.h:935
IndexType output_span
Definition: TensorBlock.h:1465
IndexType size
Definition: TensorBlock.h:1463
IndexType output_stride
Definition: TensorBlock.h:1464
IndexType count
Definition: TensorBlock.h:1462
static EIGEN_ALWAYS_INLINE void Run(Scalar *target, IndexType count, const Evaluator &eval, IndexType eval_offset)
Definition: TensorBlock.h:1333
static EIGEN_ALWAYS_INLINE void Run(Scalar *target, IndexType count, const Evaluator &eval, IndexType eval_offset)
Definition: TensorBlock.h:1324
Dimensions strides
Definition: TensorBlock.h:1366
Dimensions dims
Definition: TensorBlock.h:1365
Scalar * data
Definition: TensorBlock.h:1367
Target(const Dimensions &target_dims, const Dimensions &target_strides, Scalar *target_data, IndexType target_offset=0)
Definition: TensorBlock.h:1361
IndexType offset
Definition: TensorBlock.h:1368
IndexType input_stride
Definition: TensorBlock.h:1274
IndexType count
Definition: TensorBlock.h:1273
IndexType size
Definition: TensorBlock.h:1272
IndexType output_span
Definition: TensorBlock.h:1277
BlockIteratorState()
Definition: TensorBlock.h:1270
IndexType output_stride
Definition: TensorBlock.h:1275
IndexType input_span
Definition: TensorBlock.h:1276
Definition: TensorBlock.h:1102
IndexType offset
Definition: TensorBlock.h:1109
Dimensions strides
Definition: TensorBlock.h:1107
Dimensions dims
Definition: TensorBlock.h:1106
Scalar * data
Definition: TensorBlock.h:1108
Dst(const Dimensions &dst_dims, const Dimensions &dst_strides, Scalar *dst, IndexType dst_offset=0)
Definition: TensorBlock.h:1103
Definition: TensorBlock.h:1112
const Scalar * data
Definition: TensorBlock.h:1117
Dimensions strides
Definition: TensorBlock.h:1116
IndexType offset
Definition: TensorBlock.h:1118
Src(const Dimensions &src_strides, const Scalar *src, IndexType src_offset=0)
Definition: TensorBlock.h:1113
EIGEN_DEVICE_FUNC TensorBlockResourceRequirements & addCostPerCoeff(TensorOpCost cost)
Definition: TensorBlock.h:135
static EIGEN_DEVICE_FUNC TensorBlockResourceRequirements withShapeAndSize(TensorBlockShapeType shape_type, size_t size_in_bytes)
Definition: TensorBlock.h:96
static EIGEN_DEVICE_FUNC TensorBlockResourceRequirements withShapeAndSize(TensorBlockShapeType shape_type, size_t size_in_bytes, TensorOpCost cost)
Definition: TensorBlock.h:89
static EIGEN_DEVICE_FUNC TensorBlockResourceRequirements skewed(size_t size_in_bytes)
Definition: TensorBlock.h:119
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockShapeType merge(TensorBlockShapeType lhs, TensorBlockShapeType rhs)
Definition: TensorBlock.h:154
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t merge(size_t lhs_size, size_t rhs_size)
Definition: TensorBlock.h:150
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockResourceRequirements any()
Definition: TensorBlock.h:143
size_t size
Definition: TensorBlock.h:77
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockResourceRequirements merge(const TensorBlockResourceRequirements &lhs, const TensorBlockResourceRequirements &rhs)
Definition: TensorBlock.h:129
static EIGEN_DEVICE_FUNC TensorBlockResourceRequirements uniform(size_t size_in_bytes)
Definition: TensorBlock.h:124
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost merge(TensorOpCost lhs_cost, TensorOpCost rhs_cost)
Definition: TensorBlock.h:161
TensorOpCost cost_per_coeff
Definition: TensorBlock.h:78
TensorBlockShapeType shape_type
Definition: TensorBlock.h:76
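
A usage sketch of combining requirements; skewed() and uniform() are assumed to be templated on the coefficient type, like withShapeAndSize().

#include <unsupported/Eigen/CXX11/Tensor>
using namespace Eigen;

int main() {
  using Req = internal::TensorBlockResourceRequirements;

  // One side of an expression prefers skewed (inner-dim) blocks of ~48 KiB,
  // the other prefers uniform blocks of ~16 KiB.
  Req lhs = Req::skewed<float>(48 * 1024);
  Req rhs = Req::uniform<float>(16 * 1024);

  // merge() combines both preferences into a single requirement, and
  // addCostPerCoeff() accounts for an estimated per-element compute cost.
  Req reqs = Req::merge(lhs, rhs);
  reqs.addCostPerCoeff(TensorOpCost(/*bytes_loaded=*/0,
                                    /*bytes_stored=*/0,
                                    /*compute_cycles=*/8));
  (void)reqs;
  return 0;
}
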
void type
Definition: TensorBlock.h:582
Definition: TensorBlock.h:577
XprType::Scalar type
Definition: TensorBlock.h:578
Definition: Meta.h:305
Definition: XprHelper.h:427
Definition: Meta.h:214
Definition: GenericPacketMath.h:108
Definition: GenericPacketMath.h:134