d7/d22/TensorBlock_8h_source.html

 // This file is part of Eigen, a lightweight C++ template library

 // for linear algebra.

 //

 // This Source Code Form is subject to the terms of the Mozilla

 // Public License v. 2.0. If a copy of the MPL was not distributed

 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.


 #ifndef EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H

 #define EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H


 // IWYU pragma: private

 #include "./InternalHeaderCheck.h"


 namespace Eigen {

 namespace internal {


 // -------------------------------------------------------------------------- //

 // Forward declarations for templates defined below.

 template <typename Scalar, typename IndexType, int NumDims, int Layout>

 class TensorBlockIO;


 // -------------------------------------------------------------------------- //

 // Helper function to compute strides for densely stored buffer of given

 // dimensions.


 // TODO(ezhulenev): We compute strides 1000 times in different evaluators, use

 // this function instead everywhere.

 template <int Layout, typename IndexType, int NumDims>

 EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(const DSizes<IndexType, NumDims>& dimensions) {

   DSizes<IndexType, NumDims> strides;

   if (NumDims == 0) return strides;


   // TODO(ezhulenev): Use templates to unroll this loop (similar to

   // h_array_reduce in CXX11meta.h)? Benchmark it.

   if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {

     strides[0] = 1;

     for (int i = 1; i < NumDims; ++i) {

       strides[i] = strides[i - 1] * dimensions[i - 1];

     }

   } else {

     strides[NumDims - 1] = 1;

     for (int i = NumDims - 2; i >= 0; --i) {

       strides[i] = strides[i + 1] * dimensions[i + 1];

     }

   }


   return strides;

 }


 template <int Layout, typename IndexType, size_t NumDims>

 EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(const Eigen::array<IndexType, NumDims>& dimensions) {

   return strides<Layout>(DSizes<IndexType, NumDims>(dimensions));

 }


 template <int Layout, std::ptrdiff_t... Indices>

 EIGEN_STRONG_INLINE DSizes<std::ptrdiff_t, sizeof...(Indices)> strides(const Sizes<Indices...>& sizes) {

   return strides<Layout>(DSizes<std::ptrdiff_t, sizeof...(Indices)>(sizes));

 }


 // -------------------------------------------------------------------------- //


 // Tensor block shape type defines what are the shape preference for the blocks

 // extracted from the larger tensor.

 //

 // Example: blocks of 100 elements from the large 100x100 tensor:

 // - tensor: 100x100

 // - target_block_size: 100

 //

 // TensorBlockShapeType:

 //  - kUniformAllDims: 100 blocks of size 10x10

 //  - kSkewedInnerDims: 100 blocks of size 100x1 (or 1x100 depending on a column

 //                      or row major layout)

 enum class TensorBlockShapeType { kUniformAllDims, kSkewedInnerDims };


 struct TensorBlockResourceRequirements {

   TensorBlockShapeType shape_type;  // target block shape

   size_t size;                      // target block size

   TensorOpCost cost_per_coeff;      // cost of computing a single block element


 #ifdef EIGEN_HIPCC

   // For HIPCC, we need to explicitly declare as a "device fun", the constructor

   // which is implicitly invoked in the "merge" / "any" routines. else HIPCC

   // errors out complaining about the lack of a matching constructor

   EIGEN_DEVICE_FUNC TensorBlockResourceRequirements(TensorBlockShapeType shape_type_, size_t size_, TensorOpCost cost_)

       : shape_type(shape_type_), size(size_), cost_per_coeff(cost_) {}

 #endif


   template <typename Scalar>

   EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize(TensorBlockShapeType shape_type,

                                                                             size_t size_in_bytes, TensorOpCost cost) {

     const size_t size = numext::maxi(size_t(1), size_in_bytes / sizeof(Scalar));

     return {shape_type, size, cost};

   }


   template <typename Scalar>

   EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize(TensorBlockShapeType shape_type,

                                                                             size_t size_in_bytes) {

     // This default cost per coefficient is valid for most materialized tensor

     // block evaluation implementations, because they typically just read

     // coefficients from the underlying tensor storage, and write to the tensor

     // block buffer (scratch or destination memory, reads and writes have linear

     // access pattern). We ignore the fixed cost of block evaluation, because in

     // practice it should negligible.

     //

     // Lazy block evaluation adds the cost of calling a functor for each

     // coefficient.

     //

     // All non-trivial block evaluation implementations must provide their own

     // cost approximation (e.g. shuffling inner dimension has a much higher cost

     // because it reads memory randomly, although the total number of moved

     // bytes is the same).

     return withShapeAndSize<Scalar>(shape_type, size_in_bytes,

                                     {/*bytes_loaded=*/sizeof(Scalar),

                                      /*bytes_stored=*/sizeof(Scalar),

                                      /*compute_cycles=*/0});

   }


   template <typename Scalar>

   EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements skewed(size_t size_in_bytes) {

     return withShapeAndSize<Scalar>(TensorBlockShapeType::kSkewedInnerDims, size_in_bytes);

   }


   template <typename Scalar>

   EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements uniform(size_t size_in_bytes) {

     return withShapeAndSize<Scalar>(TensorBlockShapeType::kUniformAllDims, size_in_bytes);

   }


   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE TensorBlockResourceRequirements

   merge(const TensorBlockResourceRequirements& lhs, const TensorBlockResourceRequirements& rhs) {

     return {merge(lhs.shape_type, rhs.shape_type),           // shape_type

             merge(lhs.size, rhs.size),                       // size

             merge(lhs.cost_per_coeff, rhs.cost_per_coeff)};  // cost_per_coeff

   }


   EIGEN_DEVICE_FUNC TensorBlockResourceRequirements& addCostPerCoeff(TensorOpCost cost) {

     cost_per_coeff += cost;

     return *this;

   }


   // This is a resource requirement that should be returned from expressions

   // that do not have any block evaluation preference (e.g. default tensor

   // expression with raw buffer access).

   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE TensorBlockResourceRequirements any() {

     return {TensorBlockShapeType::kUniformAllDims, 1, {0, 0, 0}};

   }


  private:

   using Requirements = TensorBlockResourceRequirements;


   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE size_t merge(size_t lhs_size, size_t rhs_size) {

     return numext::maxi(lhs_size, rhs_size);

   }


   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE TensorBlockShapeType merge(TensorBlockShapeType lhs,

                                                                           TensorBlockShapeType rhs) {

     return (lhs == TensorBlockShapeType::kSkewedInnerDims || rhs == TensorBlockShapeType::kSkewedInnerDims)

                ? TensorBlockShapeType::kSkewedInnerDims

                : TensorBlockShapeType::kUniformAllDims;

   }


   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE TensorOpCost merge(TensorOpCost lhs_cost, TensorOpCost rhs_cost) {

     return lhs_cost + rhs_cost;

   }

 };


 // -------------------------------------------------------------------------- //

 // TensorBlockDescriptor specifies a block offset within a tensor and the block

 // sizes along each of the tensor dimensions.


 template <int NumDims, typename IndexType = Eigen::Index>

 class TensorBlockDescriptor {

  public:

   typedef DSizes<IndexType, NumDims> Dimensions;


   // If we evaluate a Tensor assignment, and expression on the left, already has

   // a memory buffer, then we might do performance optimization, and evaluate

   // the root expression directly into the final output memory. Some time it's

   // possible to reuse it for materializing subexpressions inside an expression

   // tree, to to avoid dynamic memory allocation.

   //

   // The pointer type of the underlying storage is erased, because passing

   // Scalar type through all the expression evaluation layers is way too many

   // templates. In practice destination buffer type should always match the

   // evaluated expression scalar type.

   class DestinationBuffer {

    public:

     enum DestinationBufferKind : int {

       // The above explicit specification of "int" as the enum basetype is

       // needed to get around a HIPCC link error ("the field type is not

       // amp-compatible")

       // which is issued for class members with the enum type.

       // TODO(rocm):

       // remove the "int" basetype once HIPCC has been fixed to not error out

       // in the above scenario.


       // Destination buffer is not defined (`m_data` == nullptr).

       kEmpty,


       // Tensor block defined by an owning tensor block descriptor can fit

       // contiguously into the destination buffer. In this case it's safe to

       // materialize tensor block in the destination buffer, wrap it in a

       // TensorMap, and use to build Eigen expression on top of it.

       kContiguous,


       // Destination buffer strides do not match strides of the contiguously

       // stored block, and it's impossible to define a TensorMap over this

       // buffer. However if we are evaluating a root of an expression tree, we

       // still can materialize an output into this destination, because we can

       // guarantee that no one will ever access it through block API.

       //

       // In theory it is possible to build valid TensorStriding<TensorMap>

       // expression on top of this destination buffer, however it has

       // inefficient coeff/packet access, and defeats the purpose of fast block

       // evaluation API.

       kStrided

     };


     template <typename Scalar>

     Scalar* data() const {

       eigen_assert(m_data_type_size == sizeof(Scalar));

       return static_cast<Scalar*>(m_data);

     }


     const Dimensions& strides() const { return m_strides; }

     const DestinationBufferKind& kind() const { return m_kind; }


    private:

     friend class TensorBlockDescriptor<NumDims, IndexType>;


     DestinationBuffer() : m_data(NULL), m_data_type_size(0), m_kind(kEmpty) {}


     template <typename Scalar>

     DestinationBuffer(Scalar* data, const Dimensions& strides, DestinationBufferKind kind)

         : m_data(static_cast<void*>(data)), m_data_type_size(sizeof(Scalar)), m_strides(strides), m_kind(kind) {}


     template <int Layout, typename Scalar>

     static DestinationBuffer make(const TensorBlockDescriptor& desc, Scalar* data, const Dimensions& strides) {

       return DestinationBuffer(data, strides, kind<Layout>(desc, strides));

     }


     template <int Layout>

     static DestinationBufferKind kind(const TensorBlockDescriptor& desc, const Dimensions& strides) {

       const Dimensions& desc_dims = desc.dimensions();

       const Dimensions& desc_strides = internal::strides<Layout>(desc_dims);

       for (int i = 0; i < NumDims; ++i) {

         if (desc_dims[i] == 1) continue;

         if (desc_strides[i] != strides[i]) return kStrided;

       }

       return kContiguous;

     }


     // Storage pointer is type erased, to reduce template bloat, but we still

     // keep the size of the underlying element type for error checking.

     void* m_data;

     size_t m_data_type_size;


     // Destination buffer dimensions always match the dimensions of a tensor

     // block descriptor it belongs to, however strides might be different.

     Dimensions m_strides;


     DestinationBufferKind m_kind;

   };


   TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions, const DestinationBuffer& destination)

       : m_offset(offset), m_dimensions(dimensions), m_destination(destination) {}


   TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions)

       : m_offset(offset), m_dimensions(dimensions), m_destination(DestinationBuffer()) {}


   IndexType offset() const { return m_offset; }

   const Dimensions& dimensions() const { return m_dimensions; }

   IndexType dimension(int index) const { return m_dimensions[index]; }

   IndexType size() const { return array_prod<IndexType>(m_dimensions); }


   const DestinationBuffer& destination() const { return m_destination; }


   template <int Layout, typename Scalar>

   void AddDestinationBuffer(Scalar* dst_base, const Dimensions& dst_strides) {

     eigen_assert(dst_base != NULL);

     m_destination = DestinationBuffer::template make<Layout>(*this, dst_base, dst_strides);

   }


   template <int Layout, typename Scalar, typename DstStridesIndexType>

   void AddDestinationBuffer(Scalar* dst_base, const DSizes<DstStridesIndexType, NumDims>& dst_strides) {

     // DSizes constructor will do index type promotion if it's safe.

     AddDestinationBuffer<Layout>(dst_base, Dimensions(dst_strides));

   }


   TensorBlockDescriptor& DropDestinationBuffer() {

     m_destination.m_data = NULL;

     m_destination.m_kind = DestinationBuffer::kEmpty;

     return *this;

   }


   bool HasDestinationBuffer() const { return m_destination.kind() != DestinationBuffer::kEmpty; }


   // Returns a copy of `*this` with updated offset.

   TensorBlockDescriptor WithOffset(IndexType offset) const {

     return TensorBlockDescriptor(offset, m_dimensions, m_destination);

   }


  private:

   // Offset and dimensions are immutable after construction. Block descriptor

   // can only be mutated by adding or dropping destination.

   const IndexType m_offset;

   const Dimensions m_dimensions;

   DestinationBuffer m_destination;

 };


 // -------------------------------------------------------------------------- //

 // TensorBlockMapper is responsible for iterating over the blocks of a tensor.


 template <int NumDims, int Layout, typename IndexType = Eigen::Index>

 class TensorBlockMapper {

   typedef TensorBlockDescriptor<NumDims, IndexType> BlockDescriptor;


  public:

   typedef DSizes<IndexType, NumDims> Dimensions;


   TensorBlockMapper() = default;

   TensorBlockMapper(const DSizes<IndexType, NumDims>& dimensions, const TensorBlockResourceRequirements& requirements)

       : m_tensor_dimensions(dimensions), m_requirements(requirements) {

     // Compute block dimensions and the total number of blocks.

     InitializeBlockDimensions();

   }


   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockCount() const { return m_total_block_count; }


   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockTotalSize() const { return m_block_dimensions.TotalSize(); }


   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DSizes<IndexType, NumDims>& blockDimensions() const {

     return m_block_dimensions;

   }


   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockDescriptor blockDescriptor(IndexType block_index) const {

     static const bool isColMajor = Layout == static_cast<int>(ColMajor);


     IndexType offset = 0;

     DSizes<IndexType, NumDims> dimensions;


     if (NumDims == 0) return BlockDescriptor(offset, dimensions);


     // Iterate outer -> inner dimensions.

     for (int i = NumDims - 1; i >= 0; --i) {

       const int dim = isColMajor ? i : NumDims - i - 1;


       const IndexType idx = block_index / m_block_strides[dim];

       block_index -= idx * m_block_strides[dim];


       const IndexType coord = idx * m_block_dimensions[dim];

       dimensions[dim] = numext::mini(m_tensor_dimensions[dim] - coord, m_block_dimensions[dim]);

       offset += coord * m_tensor_strides[dim];

     }


     return {offset, dimensions};

   }


  private:

   void InitializeBlockDimensions() {

     // Requested block shape and size.

     const TensorBlockShapeType shape_type = m_requirements.shape_type;

     IndexType target_block_size = numext::maxi<IndexType>(1, static_cast<IndexType>(m_requirements.size));


     IndexType tensor_size = m_tensor_dimensions.TotalSize();


     // Corner case: one of the dimensions is zero. Logic below is too complex

     // to handle this case on a general basis, just use unit block size.

     // Note: we must not yield blocks with zero dimensions (recipe for

     // overflows/underflows, divisions by zero and NaNs later).

     if (tensor_size == 0) {

       for (int i = 0; i < NumDims; ++i) {

         m_block_dimensions[i] = 1;

       }

       m_total_block_count = 0;

       return;

     }


     // If tensor fits into a target block size, evaluate it as a single block.

     if (tensor_size <= target_block_size) {

       m_block_dimensions = m_tensor_dimensions;

       m_total_block_count = 1;

       // The only valid block index is `0`, and in this case we do not need

       // to compute real strides for tensor or blocks (see blockDescriptor).

       for (int i = 0; i < NumDims; ++i) {

         m_tensor_strides[i] = 0;

         m_block_strides[i] = 1;

       }

       return;

     }


     static const bool isColMajor = Layout == static_cast<int>(ColMajor);


     // Block shape skewed towards inner dimension.

     if (shape_type == TensorBlockShapeType::kSkewedInnerDims) {

       IndexType coeff_to_allocate = target_block_size;


       for (int i = 0; i < NumDims; ++i) {

         const int dim = isColMajor ? i : NumDims - i - 1;

         m_block_dimensions[dim] = numext::mini(coeff_to_allocate, m_tensor_dimensions[dim]);

         coeff_to_allocate =

             numext::div_ceil(coeff_to_allocate, numext::maxi(static_cast<IndexType>(1), m_block_dimensions[dim]));

       }

       eigen_assert(coeff_to_allocate == 1);


     } else if (shape_type == TensorBlockShapeType::kUniformAllDims) {

       // Tensor will not fit within 'target_block_size' budget: calculate tensor

       // block dimension sizes based on "square" dimension size target.

       const IndexType dim_size_target = convert_index<IndexType>(

           std::pow(static_cast<float>(target_block_size), 1.0f / static_cast<float>(m_block_dimensions.rank())));


       for (int i = 0; i < NumDims; ++i) {

         // TODO(andydavis) Adjust the inner most 'block_dim_size' to make it

         // a multiple of the packet size. Note that reducing

         // 'block_dim_size' in this manner can increase the number of

         // blocks, and so will amplify any per-block overhead.

         m_block_dimensions[i] = numext::mini(dim_size_target, m_tensor_dimensions[i]);

       }


       // Add any un-allocated coefficients to inner dimension(s).

       IndexType total_size = m_block_dimensions.TotalSize();

       for (int i = 0; i < NumDims; ++i) {

         const int dim = isColMajor ? i : NumDims - i - 1;


         if (m_block_dimensions[dim] < m_tensor_dimensions[dim]) {

           const IndexType total_size_other_dims = total_size / m_block_dimensions[dim];

           const IndexType alloc_avail = numext::div_ceil<IndexType>(target_block_size, total_size_other_dims);

           if (alloc_avail == m_block_dimensions[dim]) {

             // Insufficient excess coefficients to allocate.

             break;

           }

           m_block_dimensions[dim] = numext::mini(m_tensor_dimensions[dim], alloc_avail);

           total_size = total_size_other_dims * m_block_dimensions[dim];

         }

       }


     } else {

       eigen_assert(false);  // unknown block shape

     }


     eigen_assert(m_block_dimensions.TotalSize() >=

                  numext::mini<IndexType>(target_block_size, m_tensor_dimensions.TotalSize()));


     // Calculate block counts by dimension and total block count.

     DSizes<IndexType, NumDims> block_count;

     for (int i = 0; i < NumDims; ++i) {

       block_count[i] = numext::div_ceil(m_tensor_dimensions[i], m_block_dimensions[i]);

     }

     m_total_block_count = array_prod(block_count);


     // Calculate block strides (used for enumerating blocks).

     m_tensor_strides = strides<Layout>(m_tensor_dimensions);

     m_block_strides = strides<Layout>(block_count);

   }


   DSizes<IndexType, NumDims> m_tensor_dimensions;

   TensorBlockResourceRequirements m_requirements;


   DSizes<IndexType, NumDims> m_block_dimensions;

   IndexType m_total_block_count;


   DSizes<IndexType, NumDims> m_tensor_strides;

   DSizes<IndexType, NumDims> m_block_strides;

 };


 // -------------------------------------------------------------------------- //

 // TensorBlockScratchAllocator is responsible for allocating temporary buffers

 // for block evaluation (output or input block materialization). Given that

 // Eigen expression traversal order is deterministic, all temporary allocations

 // are happening in the same order, and usually have exactly the same size.

 // Scratch allocator keeps a trace of all dynamic allocations, and after the

 // first block evaluation is completed, we should be able to reuse all the

 // temporary buffers for the next block evaluation.


 template <typename Device>

 class TensorBlockScratchAllocator {

  public:

   explicit TensorBlockScratchAllocator(const Device& device) : m_device(device), m_allocation_index(0) {}


   ~TensorBlockScratchAllocator() {

     for (size_t i = 0; i < m_allocations.size(); ++i) {

       m_device.deallocate(m_allocations[i].ptr);

     }

   }


   void* allocate(size_t size) {

     // TODO(ezhulenev): Remove when replaced with inlined vector.

     if (m_allocations.capacity() == 0) m_allocations.reserve(8);


     // Check if we already have an existing allocation att current index.

     const int num_allocations = static_cast<int>(m_allocations.size());

     const bool has_allocation = m_allocation_index < num_allocations;


     // Allocation index can't be larger than the number of allocations.

     eigen_assert(m_allocation_index <= num_allocations);


     // If we have existing allocation, and its size is larger or equal to

     // requested size, we do nothing.


     // If current allocation can't fit requested size, we deallocate it, and

     // replace with a larger allocation.

     if (has_allocation && m_allocations[m_allocation_index].size < size) {

       m_device.deallocate(m_allocations[m_allocation_index].ptr);

       m_allocations[m_allocation_index].ptr = m_device.allocate(size);

       m_allocations[m_allocation_index].size = size;

     }


     // Make a new allocation if we don't have and existing one.

     if (!has_allocation) {

       Allocation allocation;

       allocation.ptr = m_device.allocate(size);

       allocation.size = size;

       m_allocations.push_back(allocation);

     }


     eigen_assert(m_allocations[m_allocation_index].ptr != NULL);

     eigen_assert(m_allocations[m_allocation_index].size >= size);


     return m_allocations[m_allocation_index++].ptr;

   }


   void reset() { m_allocation_index = 0; }


  private:

   struct Allocation {

     void* ptr;

     size_t size;

   };


   const Device& m_device;

   int m_allocation_index;

   // TODO(ezhulenev): This should be an inlined vector.

   std::vector<Allocation> m_allocations;

 };


 // -------------------------------------------------------------------------- //

 // TensorBlockKind represents all possible block kinds, that can be produced by

 // TensorEvaluator::evalBlock function.

 enum TensorBlockKind {

   // Tensor block that is a lazy expression that must be assigned to a

   // destination using TensorBlockAssign.

   kExpr,


   // Tensor block that is a view into a memory buffer owned by an underlying

   // Tensor expression (e.g. it can be a view into a Tensor buffer).

   kView,


   // Tensor block that was materialized in a scratch memory buffer, allocated

   // with TensorBlockScratchAllocator. This block must be copied to a

   // destination, similar to a block of `kExpr` type.

   kMaterializedInScratch,


   // Tensor block that was materialized directly into the final output memory

   // buffer. For example if the left side of an assignment is a Tensor, we can

   // directly materialize the block in the destination memory.

   //

   // If strides in the output buffer do not match tensor block strides, the

   // Tensor expression will be invalid, and should not be used by

   // TensorBlockAssign or for constructing another block expression.

   kMaterializedInOutput

 };


 // -------------------------------------------------------------------------- //

 // TensorBlockNotImplemented should be used to defined TensorBlock typedef in

 // TensorEvaluators that do not support block evaluation.


 class TensorBlockNotImplemented {

  public:

   typedef void XprType;

 };


 // -------------------------------------------------------------------------- //

 // XprScalar extracts Scalar type from the Eigen expressions (if expression type

 // is not void). It's required to be able to define lazy block expression for

 // argument types, that do not support block evaluation.


 template <typename XprType>

 struct XprScalar {

   typedef typename XprType::Scalar type;

 };

 template <>

 struct XprScalar<void> {

   typedef void type;

 };


 // -------------------------------------------------------------------------- //

 // TensorMaterializedBlock is a fully evaluated block of the original tensor,

 // and XprType is just a TensorMap over the data. This block type is typically

 // used to materialize blocks of tensor expressions, that can't be efficiently

 // represented as lazy Tensor expressions with fast coeff/packet operations,

 // e.g. we materialize all broadcasts into evaluated blocks.

 //

 // TensorMaterializedBlock does not own its memory buffer, it's either a memory

 // buffer that backs the original expression (e.g. block is just a view into a

 // Tensor), or a memory buffer allocated with scratch allocator, and in this

 // case the scratch allocator will deallocate it at the end of block based

 // expression execution.

 //

 // If the block was evaluated directly into the output buffer, and strides in

 // the output buffer do not match block strides, the TensorMap expression will

 // be invalid, and should never be used in block assignment or any other tensor

 // expression.


 template <typename Scalar, int NumDims, int Layout, typename IndexType = Eigen::Index>

 class TensorMaterializedBlock {

  public:

   typedef DSizes<IndexType, NumDims> Dimensions;

   typedef TensorMap<const Tensor<Scalar, NumDims, Layout> > XprType;


   TensorMaterializedBlock(TensorBlockKind kind, const Scalar* data, const Dimensions& dimensions,

                           bool valid_expr = true)

       : m_kind(kind), m_data(data), m_dimensions(dimensions), m_expr(m_data, m_dimensions), m_valid_expr(valid_expr) {

     eigen_assert(m_kind == internal::TensorBlockKind::kView ||

                  m_kind == internal::TensorBlockKind::kMaterializedInScratch ||

                  m_kind == internal::TensorBlockKind::kMaterializedInOutput);

   }


   TensorBlockKind kind() const { return m_kind; }

   // NOTE(ezhulenev): Returning XprType by value like in other block types

   // causes asan failures. The theory is that XprType::Nested doesn't work

   // properly for TensorMap.

   const XprType& expr() const {

     eigen_assert(m_valid_expr);

     return m_expr;

   }

   const Scalar* data() const { return m_data; }

   void cleanup() {}


   typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;


   // TensorMaterializedBlock can be backed by different types of storage:

   //

   //   (1) Contiguous block of memory allocated with scratch allocator.

   //   (2) Contiguous block of memory reused from tensor block descriptor

   //       destination buffer.

   //   (3) Strided block of memory reused from tensor block descriptor

   //       destination buffer.

   //

   class Storage {

    public:

     Scalar* data() const { return m_data; }

     const Dimensions& dimensions() const { return m_dimensions; }

     const Dimensions& strides() const { return m_strides; }


     TensorMaterializedBlock AsTensorMaterializedBlock() const {

       return TensorMaterializedBlock(m_materialized_in_output ? internal::TensorBlockKind::kMaterializedInOutput

                                                               : internal::TensorBlockKind::kMaterializedInScratch,

                                      m_data, m_dimensions, !m_strided_storage);

     }


    private:

     friend class TensorMaterializedBlock<Scalar, NumDims, Layout, IndexType>;


     Storage(Scalar* data, const Dimensions& dimensions, const Dimensions& strides, bool materialized_in_output,

             bool strided_storage)

         : m_data(data),

           m_dimensions(dimensions),

           m_strides(strides),

           m_materialized_in_output(materialized_in_output),

           m_strided_storage(strided_storage) {}


     Scalar* m_data;

     Dimensions m_dimensions;

     Dimensions m_strides;

     bool m_materialized_in_output;

     bool m_strided_storage;

   };


   // Creates a storage for materialized block either from the block descriptor

   // destination buffer, or allocates a new buffer with scratch allocator.

   template <typename TensorBlockScratch>

   EIGEN_STRONG_INLINE static Storage prepareStorage(TensorBlockDesc& desc, TensorBlockScratch& scratch,

                                                     bool allow_strided_storage = false) {

     // Try to reuse destination as an output block buffer.

     typedef typename TensorBlockDesc::DestinationBuffer DestinationBuffer;


     if (desc.destination().kind() == DestinationBuffer::kContiguous) {

       Scalar* buffer = desc.destination().template data<Scalar>();

       desc.DropDestinationBuffer();

       return Storage(buffer, desc.dimensions(), internal::strides<Layout>(desc.dimensions()),

                      /*materialized_in_output=*/true,

                      /*strided_storage=*/false);


     } else if (desc.destination().kind() == DestinationBuffer::kStrided && allow_strided_storage) {

       Scalar* buffer = desc.destination().template data<Scalar>();

       desc.DropDestinationBuffer();

       return Storage(buffer, desc.dimensions(), desc.destination().strides(),

                      /*materialized_in_output=*/true, /*strided_storage=*/true);


     } else {

       void* mem = scratch.allocate(desc.size() * sizeof(Scalar));

       return Storage(static_cast<Scalar*>(mem), desc.dimensions(), internal::strides<Layout>(desc.dimensions()),

                      /*materialized_in_output=*/false,

                      /*strided_storage=*/false);

     }

   }


   // Creates a materialized block for the given descriptor from a memory buffer.

   template <typename DataDimensions, typename TensorBlockScratch>

   EIGEN_STRONG_INLINE static TensorMaterializedBlock materialize(const Scalar* data, const DataDimensions& data_dims,

                                                                  TensorBlockDesc& desc, TensorBlockScratch& scratch) {

     eigen_assert(array_size<DataDimensions>::value == desc.dimensions().size());


     // If a tensor block dimensions covers a contiguous block of the underlying

     // memory, we can skip block buffer memory allocation, and construct a block

     // from existing `data` memory buffer.

     //

     // Example: (RowMajor layout)

     //   data_dims:          [11, 12, 13, 14]

     //   desc.dimensions():  [1,   1,  3, 14]

     //

     // In this case we can construct a TensorBlock starting at

     // `data + desc.offset()`, with a `desc.dimensions()` block sizes.

     static const bool is_col_major = Layout == ColMajor;


     // Find out how many inner dimensions have a matching size.

     int num_matching_inner_dims = 0;

     for (int i = 0; i < NumDims; ++i) {

       int dim = is_col_major ? i : NumDims - i - 1;

       if (data_dims[dim] != desc.dimensions()[dim]) break;

       ++num_matching_inner_dims;

     }


     // All the outer dimensions must be of size `1`, except a single dimension

     // before the matching inner dimension (`3` in the example above).

     bool can_use_direct_access = true;

     for (int i = num_matching_inner_dims + 1; i < NumDims; ++i) {

       int dim = is_col_major ? i : NumDims - i - 1;

       if (desc.dimension(dim) != 1) {

         can_use_direct_access = false;

         break;

       }

     }


     if (can_use_direct_access) {

       const Scalar* block_start = data + desc.offset();

       return TensorMaterializedBlock(internal::TensorBlockKind::kView, block_start, desc.dimensions());


     } else {

       // Reuse destination buffer or allocate new buffer with scratch allocator.

       const Storage storage = prepareStorage(desc, scratch);


       typedef internal::TensorBlockIO<Scalar, IndexType, NumDims, Layout> TensorBlockIO;

       typedef typename TensorBlockIO::Dst TensorBlockIODst;

       typedef typename TensorBlockIO::Src TensorBlockIOSrc;


       TensorBlockIOSrc src(internal::strides<Layout>(Dimensions(data_dims)), data, desc.offset());

       TensorBlockIODst dst(storage.dimensions(), storage.strides(), storage.data());


       TensorBlockIO::Copy(dst, src);

       return storage.AsTensorMaterializedBlock();

     }

   }


  private:

   TensorBlockKind m_kind;

   const Scalar* m_data;

   Dimensions m_dimensions;

   XprType m_expr;

   bool m_valid_expr;

 };


 // -------------------------------------------------------------------------- //

 // TensorCwiseUnaryBlock is a lazy tensor expression block that applies UnaryOp

 // functor to the blocks produced by the underlying Tensor expression.


 template <typename UnaryOp, typename ArgTensorBlock>

 class TensorCwiseUnaryBlock {

   static constexpr bool NoArgBlockAccess = internal::is_void<typename ArgTensorBlock::XprType>::value;


  public:

   typedef std::conditional_t<NoArgBlockAccess, void,

                              TensorCwiseUnaryOp<UnaryOp, const typename ArgTensorBlock::XprType> >

       XprType;


   typedef typename XprScalar<XprType>::type Scalar;


   TensorCwiseUnaryBlock(const ArgTensorBlock& arg_block, const UnaryOp& functor)

       : m_arg_block(arg_block), m_functor(functor) {}


   TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }


   XprType expr() const { return XprType(m_arg_block.expr(), m_functor); }

   const Scalar* data() const { return NULL; }

   void cleanup() { m_arg_block.cleanup(); }


  private:

   ArgTensorBlock m_arg_block;

   UnaryOp m_functor;

 };


 // -------------------------------------------------------------------------- //

 // TensorCwiseUnaryBlock is a lazy tensor expression block that applies BinaryOp

 // functor to the blocks produced by the underlying Tensor expression.


 template <typename BinaryOp, typename LhsTensorBlock, typename RhsTensorBlock>

 class TensorCwiseBinaryBlock {

   static constexpr bool NoArgBlockAccess = internal::is_void<typename LhsTensorBlock::XprType>::value ||

                                            internal::is_void<typename RhsTensorBlock::XprType>::value;


  public:

   typedef std::conditional_t<

       NoArgBlockAccess, void,

       TensorCwiseBinaryOp<BinaryOp, const typename LhsTensorBlock::XprType, const typename RhsTensorBlock::XprType> >

       XprType;


   typedef typename XprScalar<XprType>::type Scalar;


   TensorCwiseBinaryBlock(const LhsTensorBlock& left_block, const RhsTensorBlock& right_block, const BinaryOp& functor)

       : m_left_block(left_block), m_right_block(right_block), m_functor(functor) {}


   TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }


   XprType expr() const { return XprType(m_left_block.expr(), m_right_block.expr(), m_functor); }


   const Scalar* data() const { return NULL; }


   void cleanup() {

     m_left_block.cleanup();

     m_right_block.cleanup();

   }


  private:

   LhsTensorBlock m_left_block;

   RhsTensorBlock m_right_block;

   BinaryOp m_functor;

 };


 // -------------------------------------------------------------------------- //

 // TensorUnaryExprBlock is a lazy tensor expression block that can construct

 // an arbitrary tensor expression from a block of the underlying type (this is a

 // generalization of the TensorCwiseUnaryBlock for arbitrary expressions).


 template <typename BlockFactory, typename ArgTensorBlock>

 class TensorUnaryExprBlock {

   typedef typename ArgTensorBlock::XprType ArgXprType;

   static constexpr bool NoArgBlockAccess = internal::is_void<ArgXprType>::value;


  public:

   typedef std::conditional_t<NoArgBlockAccess, void, typename BlockFactory::template XprType<ArgXprType>::type> XprType;


   typedef typename XprScalar<XprType>::type Scalar;


   TensorUnaryExprBlock(const ArgTensorBlock& arg_block, const BlockFactory& factory)

       : m_arg_block(arg_block), m_factory(factory) {}


   TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }

   XprType expr() const { return m_factory.expr(m_arg_block.expr()); }

   const Scalar* data() const { return NULL; }

   void cleanup() { m_arg_block.cleanup(); }


  private:

   ArgTensorBlock m_arg_block;

   BlockFactory m_factory;

 };


 // -------------------------------------------------------------------------- //

 // TensorTernaryExprBlock is a lazy tensor expression block that can construct

 // an arbitrary tensor expression from three blocks of the underlying type.


 template <typename BlockFactory, typename Arg1TensorBlock, typename Arg2TensorBlock, typename Arg3TensorBlock>

 class TensorTernaryExprBlock {

   typedef typename Arg1TensorBlock::XprType Arg1XprType;

   typedef typename Arg2TensorBlock::XprType Arg2XprType;

   typedef typename Arg3TensorBlock::XprType Arg3XprType;


   static constexpr bool NoArgBlockAccess = internal::is_void<Arg1XprType>::value ||

                                            internal::is_void<Arg2XprType>::value ||

                                            internal::is_void<Arg3XprType>::value;


  public:

   typedef std::conditional_t<NoArgBlockAccess, void,

                              typename BlockFactory::template XprType<Arg1XprType, Arg2XprType, Arg3XprType>::type>

       XprType;


   typedef typename XprScalar<XprType>::type Scalar;


   TensorTernaryExprBlock(const Arg1TensorBlock& arg1_block, const Arg2TensorBlock& arg2_block,

                          const Arg3TensorBlock& arg3_block, const BlockFactory& factory)

       : m_arg1_block(arg1_block), m_arg2_block(arg2_block), m_arg3_block(arg3_block), m_factory(factory) {}


   TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }

   XprType expr() const { return m_factory.expr(m_arg1_block.expr(), m_arg2_block.expr(), m_arg3_block.expr()); }

   const Scalar* data() const { return NULL; }

   void cleanup() {

     m_arg1_block.cleanup();

     m_arg2_block.cleanup();

     m_arg3_block.cleanup();

   }


  private:

   Arg1TensorBlock m_arg1_block;

   Arg2TensorBlock m_arg2_block;

   Arg3TensorBlock m_arg3_block;

   BlockFactory m_factory;

 };


 // -------------------------------------------------------------------------- //

 // StridedLinearBufferCopy provides a method to copy data between two linear

 // buffers with different strides, with optimized paths for scatter/gather.


 template <typename Scalar, typename IndexType>

 class StridedLinearBufferCopy {

   typedef typename packet_traits<Scalar>::type Packet;

   typedef typename unpacket_traits<Packet>::half HalfPacket;

   enum {

     Vectorizable = packet_traits<Scalar>::Vectorizable,

     PacketSize = packet_traits<Scalar>::size,

     HalfPacketSize = unpacket_traits<HalfPacket>::size,

     HasHalfPacket = static_cast<int>(HalfPacketSize) < static_cast<int>(PacketSize)

   };


  public:

   // Specifying linear copy kind statically gives ~30% speedup for small sizes.

   enum class Kind {

     Linear = 0,       // src_stride == 1 && dst_stride == 1

     Scatter = 1,      // src_stride == 1 && dst_stride != 1

     FillLinear = 2,   // src_stride == 0 && dst_stride == 1

     FillScatter = 3,  // src_stride == 0 && dst_stride != 1

     Gather = 4,       // dst_stride == 1

     Random = 5        // everything else

   };


   struct Dst {

     Dst(IndexType o, IndexType s, Scalar* d) : offset(o), stride(s), data(d) {}


     IndexType offset;

     IndexType stride;

     Scalar* data;

   };


   struct Src {

     Src(IndexType o, IndexType s, const Scalar* d) : offset(o), stride(s), data(d) {}


     IndexType offset;

     IndexType stride;

     const Scalar* data;

   };


   template <typename StridedLinearBufferCopy::Kind kind>

   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Dst& dst, const Src& src, const size_t count) {

     Run<kind>(count, dst.offset, dst.stride, dst.data, src.offset, src.stride, src.data);

   }


  private:

   template <typename StridedLinearBufferCopy::Kind kind>

   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const IndexType count, const IndexType dst_offset,

                                                         const IndexType dst_stride, Scalar* EIGEN_RESTRICT dst_data,

                                                         const IndexType src_offset, const IndexType src_stride,

                                                         const Scalar* EIGEN_RESTRICT src_data) {

     const Scalar* src = &src_data[src_offset];

     Scalar* dst = &dst_data[dst_offset];


     if (!Vectorizable) {

       for (Index i = 0; i < count; ++i) {

         dst[i * dst_stride] = src[i * src_stride];

       }

       return;

     }


     const IndexType vectorized_size = PacketSize * (count / PacketSize);

     IndexType i = 0;


     if (kind == StridedLinearBufferCopy::Kind::Linear) {

       // ******************************************************************** //

       // Linear copy from `src` to `dst`.

       const IndexType unrolled_size = (4 * PacketSize) * (count / (4 * PacketSize));

       eigen_assert(src_stride == 1 && dst_stride == 1);

       for (; i < unrolled_size; i += 4 * PacketSize) {

         for (int j = 0; j < 4; ++j) {

           Packet p = ploadu<Packet>(src + i + j * PacketSize);

           pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);

         }

       }

       for (; i < vectorized_size; i += PacketSize) {

         Packet p = ploadu<Packet>(src + i);

         pstoreu<Scalar, Packet>(dst + i, p);

       }

       if (HasHalfPacket) {

         const IndexType vectorized_half_size = HalfPacketSize * (count / HalfPacketSize);

         if (i < vectorized_half_size) {

           HalfPacket p = ploadu<HalfPacket>(src + i);

           pstoreu<Scalar, HalfPacket>(dst + i, p);

           i += HalfPacketSize;

         }

       }

       for (; i < count; ++i) {

         dst[i] = src[i];

       }

       // ******************************************************************** //

     } else if (kind == StridedLinearBufferCopy::Kind::Scatter) {

       // Scatter from `src` to `dst`.

       eigen_assert(src_stride == 1 && dst_stride != 1);

       for (; i < vectorized_size; i += PacketSize) {

         Packet p = ploadu<Packet>(src + i);

         pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);

       }

       if (HasHalfPacket) {

         const IndexType vectorized_half_size = HalfPacketSize * (count / HalfPacketSize);

         if (i < vectorized_half_size) {

           HalfPacket p = ploadu<HalfPacket>(src + i);

           pscatter<Scalar, HalfPacket>(dst + i * dst_stride, p, dst_stride);

           i += HalfPacketSize;

         }

       }

       for (; i < count; ++i) {

         dst[i * dst_stride] = src[i];

       }

       // ******************************************************************** //

     } else if (kind == StridedLinearBufferCopy::Kind::FillLinear) {

       // Fill `dst` with value at `*src`.

       eigen_assert(src_stride == 0 && dst_stride == 1);


       const IndexType unrolled_size = (4 * PacketSize) * (count / (4 * PacketSize));

       Scalar s = *src;

       Packet p = pset1<Packet>(s);

       for (; i < unrolled_size; i += 4 * PacketSize) {

         for (int j = 0; j < 4; ++j) {

           pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);

         }

       }

       for (; i < vectorized_size; i += PacketSize) {

         pstoreu<Scalar, Packet>(dst + i, p);

       }

       if (HasHalfPacket) {

         const IndexType vectorized_half_size = HalfPacketSize * (count / HalfPacketSize);

         if (i < vectorized_half_size) {

           HalfPacket hp = pset1<HalfPacket>(s);

           pstoreu<Scalar, HalfPacket>(dst + i, hp);

           i += HalfPacketSize;

         }

       }

       for (; i < count; ++i) {

         dst[i] = s;

       }

       // ******************************************************************** //

     } else if (kind == StridedLinearBufferCopy::Kind::FillScatter) {

       // Scatter `*src` into `dst`.

       eigen_assert(src_stride == 0 && dst_stride != 1);

       Scalar s = *src;

       Packet p = pset1<Packet>(s);

       for (; i < vectorized_size; i += PacketSize) {

         pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);

       }

       if (HasHalfPacket) {

         const IndexType vectorized_half_size = HalfPacketSize * (count / HalfPacketSize);

         if (i < vectorized_half_size) {

           HalfPacket hp = pset1<HalfPacket>(s);

           pscatter<Scalar, HalfPacket>(dst + i * dst_stride, hp, dst_stride);

           i += HalfPacketSize;

         }

       }

       for (; i < count; ++i) {

         dst[i * dst_stride] = s;

       }

       // ******************************************************************** //

     } else if (kind == StridedLinearBufferCopy::Kind::Gather) {

       // Gather from `src` into `dst`.

       eigen_assert(dst_stride == 1);

       for (; i < vectorized_size; i += PacketSize) {

         Packet p = pgather<Scalar, Packet>(src + i * src_stride, src_stride);

         pstoreu<Scalar, Packet>(dst + i, p);

       }

       if (HasHalfPacket) {

         const IndexType vectorized_half_size = HalfPacketSize * (count / HalfPacketSize);

         if (i < vectorized_half_size) {

           HalfPacket p = pgather<Scalar, HalfPacket>(src + i * src_stride, src_stride);

           pstoreu<Scalar, HalfPacket>(dst + i, p);

           i += HalfPacketSize;

         }

       }

       for (; i < count; ++i) {

         dst[i] = src[i * src_stride];

       }

       // ******************************************************************** //

     } else if (kind == StridedLinearBufferCopy::Kind::Random) {

       // Random.

       for (; i < count; ++i) {

         dst[i * dst_stride] = src[i * src_stride];

       }

     } else {

       eigen_assert(false);

     }

   }

 };


 // -------------------------------------------------------------------------- //

 // TensorBlockIO copies data from `src` tensor block, to the `dst` tensor block.

 // It's possible to specify src->dst dimension mapping for the copy operation.

 // Dimensions of `dst` specify how many elements have to be copied, for the

 // `src` we need to know only stride to navigate through source memory buffer.


 template <typename Scalar, typename IndexType, int NumDims, int Layout>

 class TensorBlockIO {

   static constexpr bool IsColMajor = (Layout == ColMajor);


   typedef StridedLinearBufferCopy<Scalar, IndexType> LinCopy;


  public:

   typedef DSizes<IndexType, NumDims> Dimensions;

   typedef DSizes<int, NumDims> DimensionsMap;


   struct Dst {

     Dst(const Dimensions& dst_dims, const Dimensions& dst_strides, Scalar* dst, IndexType dst_offset = 0)

         : dims(dst_dims), strides(dst_strides), data(dst), offset(dst_offset) {}


     Dimensions dims;

     Dimensions strides;

     Scalar* data;

     IndexType offset;

   };


   struct Src {

     Src(const Dimensions& src_strides, const Scalar* src, IndexType src_offset = 0)

         : strides(src_strides), data(src), offset(src_offset) {}


     Dimensions strides;

     const Scalar* data;

     IndexType offset;

   };


   // Copies data to `dst` from `src`, using provided dimensions mapping:

   //

   //   src_dimension_index = dst_to_src_dim_map[dst_dimension_index]

   //

   // Returns the number of copied elements.

   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType Copy(const Dst& dst, const Src& src,

                                                               const DimensionsMap& dst_to_src_dim_map) {

     // Copy single scalar value from `src` to `dst`.

     if (NumDims == 0) {

       *(dst.data + dst.offset) = *(src.data + src.offset);

       return 1;

     }


     // Both `dst` and `src` must have contiguous innermost dimension. We also

     // accept the special case with stride '0', because it's used as a trick to

     // implement broadcasting.

     {

       int inner_dim = IsColMajor ? 0 : NumDims - 1;

       EIGEN_UNUSED_VARIABLE(inner_dim);

       eigen_assert(dst.strides[inner_dim] == 1 || dst.strides[inner_dim] == 0);

       eigen_assert(src.strides[inner_dim] == 1 || src.strides[inner_dim] == 0);

     }


     // Give a shorter name to `dst_to_src_dim_map`.

     const DimensionsMap& dim_map = dst_to_src_dim_map;


     // Do not squeeze reordered inner dimensions.

     int num_squeezable_dims = NumSqueezableInnerDims(dim_map);


     // NOTE: We find the innermost dimension (contiguous in memory) in the dst

     // block, and we write data linearly into that dimension, reading it from

     // the src. If dimensions are reordered, we might end up reading data from

     // the src with `stride != 1`.

     //

     // NOTE: Random-Read/Linear-Write can be up to ~2X faster than

     // Linear-Read/Random-Write: https://stackoverflow.com/a/54935680


     // Find the innermost dimension in the dst whose size is not 1. This is the

     // effective inner dim.

     int num_size_one_inner_dims = 0;

     for (int i = 0; i < num_squeezable_dims; ++i) {

       const int dst_dim = IsColMajor ? i : NumDims - i - 1;

       if (dst.dims[dst_dim] != 1) break;

       num_size_one_inner_dims++;

     }


     // If all dimensions are of size 1, just copy a scalar from `src` to `dst`.

     if (num_size_one_inner_dims == NumDims) {

       *(dst.data + dst.offset) = *(src.data + src.offset);

       return 1;

     }


     // Outermost dimension in the dst with `stride == 1` (contiguous in memory).

     const int dst_stride1_dim = IsColMajor ? num_size_one_inner_dims : NumDims - num_size_one_inner_dims - 1;


     // Dimension in the src that corresponds to the dst innermost dimension.

     const int src_dim_for_dst_stride1_dim = NumDims == 0 ? 1 : dim_map[dst_stride1_dim];


     // Size of the innermost dimension (length of contiguous blocks of memory).

     IndexType dst_inner_dim_size = NumDims == 0 ? 1 : dst.dims[dst_stride1_dim];


     // Squeeze multiple inner dims into one if they are contiguous in `dst` and

     // `src` memory, so we can do less linear copy calls.

     for (int i = num_size_one_inner_dims + 1; i < num_squeezable_dims; ++i) {

       const int dst_dim = IsColMajor ? i : NumDims - i - 1;

       const IndexType dst_stride = dst.strides[dst_dim];

       const IndexType src_stride = src.strides[dim_map[dst_dim]];

       if (dst_inner_dim_size == dst_stride && dst_stride == src_stride) {

         dst_inner_dim_size *= dst.dims[dst_dim];

         ++num_size_one_inner_dims;

       } else {

         break;

       }

     }


     // Setup strides to read data from `src` and write to `dst`.

     IndexType input_offset = src.offset;

     IndexType output_offset = dst.offset;

     IndexType input_stride = NumDims == 0 ? 1 : src.strides[src_dim_for_dst_stride1_dim];

     IndexType output_stride = NumDims == 0 ? 1 : dst.strides[dst_stride1_dim];


     const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1;

     array<BlockIteratorState, at_least_1_dim> it;


     // Initialize block iterator state. Squeeze away any dimension of size 1.

     int idx = 0;  // currently initialized iterator state index

     for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) {

       const int dst_dim = IsColMajor ? i + 1 : NumDims - i - 2;

       if (dst.dims[dst_dim] == 1) continue;


       it[idx].size = dst.dims[dst_dim];

       it[idx].input_stride = src.strides[dim_map[dst_dim]];

       it[idx].output_stride = dst.strides[dst_dim];


       it[idx].input_span = it[idx].input_stride * (it[idx].size - 1);

       it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);


       idx++;

     }


     // Iterate copying data from src to dst.

     const IndexType block_total_size = NumDims == 0 ? 1 : dst.dims.TotalSize();


 #define COPY_INNER_DIM(KIND)                                                                                      \

   IndexType num_copied = 0;                                                                                       \

   for (num_copied = 0; num_copied < block_total_size; num_copied += dst_inner_dim_size) {                         \

     LinCopy::template Run<KIND>(typename LinCopy::Dst(output_offset, output_stride, dst.data),                    \

                                 typename LinCopy::Src(input_offset, input_stride, src.data), dst_inner_dim_size); \

                                                                                                                   \

     for (int j = 0; j < idx; ++j) {                                                                               \

       if (++it[j].count < it[j].size) {                                                                           \

         input_offset += it[j].input_stride;                                                                       \

         output_offset += it[j].output_stride;                                                                     \

         break;                                                                                                    \

       }                                                                                                           \

       it[j].count = 0;                                                                                            \

       input_offset -= it[j].input_span;                                                                           \

       output_offset -= it[j].output_span;                                                                         \

     }                                                                                                             \

   }                                                                                                               \

   return num_copied;


     if (input_stride == 1 && output_stride == 1) {

       COPY_INNER_DIM(LinCopy::Kind::Linear);

     } else if (input_stride == 1 && output_stride != 1) {

       COPY_INNER_DIM(LinCopy::Kind::Scatter);

     } else if (input_stride == 0 && output_stride == 1) {

       COPY_INNER_DIM(LinCopy::Kind::FillLinear);

     } else if (input_stride == 0 && output_stride != 1) {

       COPY_INNER_DIM(LinCopy::Kind::FillScatter);

     } else if (output_stride == 1) {

       COPY_INNER_DIM(LinCopy::Kind::Gather);

     } else {

       COPY_INNER_DIM(LinCopy::Kind::Random);

     }


 #undef COPY_INNER_DIM

   }


   // Copy from `src` to `dst` with an identity src->dst dimension map. Returns

   // the number of copied elements.

   static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexType Copy(const Dst& dst, const Src& src) {

     DimensionsMap dst_to_src_map;

     for (int i = 0; i < NumDims; ++i) dst_to_src_map[i] = i;

     return Copy(dst, src, dst_to_src_map);

   }


  private:

   struct BlockIteratorState {

     BlockIteratorState() : size(0), count(0), input_stride(0), output_stride(0), input_span(0), output_span(0) {}


     IndexType size;

     IndexType count;

     IndexType input_stride;

     IndexType output_stride;

     IndexType input_span;

     IndexType output_span;

   };


   // Compute how many inner dimensions it's allowed to squeeze when doing IO

   // between two tensor blocks. It's safe to squeeze inner dimensions, only

   // if they are not reordered.

   static int NumSqueezableInnerDims(const DimensionsMap& dim_map) {

     int num_squeezable_dims = 0;

     for (int i = 0; i < NumDims; ++i) {

       const int dim = IsColMajor ? i : NumDims - i - 1;

       if (dim_map[dim] != dim) break;

       num_squeezable_dims++;

     }

     return num_squeezable_dims;

   }

 };


 // -------------------------------------------------------------------------- //

 // TensorBlockAssignment assigns a block expression of type `TensorBlockExpr` to

 // a Tensor block defined by `desc`, backed by a memory buffer at `target`.

 //

 // Currently there is no way to write from a Tensor expression to a block of

 // memory, if dimensions are reordered. If you need to do that, you should

 // materialize a Tensor block expression into a memory buffer, and then use

 // TensorBlockIO to copy data between two memory buffers with a custom

 // `target->src` dimension map (see definition above).

 //

 // Also currently the innermost dimension of `target` must have a stride '1'

 // (contiguous in memory). This restriction could be lifted with a `pscatter`,

 // but in practice it's never needed, and there is a similar TensorBlockIO

 // workaround for that.

 //

 // TODO(ezhulenev): TensorBlockAssignment is a special case of TensorBlockIO

 // where `src` is a tensor expression. Explore if it is possible to rewrite IO

 // to use expressions instead of pointers, and after that TensorBlockAssignment

 // will become an alias to IO.

 template <typename Scalar, int NumDims, typename TensorBlockExpr, typename IndexType = Eigen::Index>

 class TensorBlockAssignment {

   // We will use coeff/packet path to evaluate block expressions.

   typedef TensorEvaluator<const TensorBlockExpr, DefaultDevice> TensorBlockEvaluator;


   typedef DSizes<IndexType, NumDims> Dimensions;


   enum { Vectorizable = packet_traits<Scalar>::Vectorizable, PacketSize = packet_traits<Scalar>::size };


   template <bool Vectorizable, typename Evaluator>

   struct InnerDimAssign {

     EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count, const Evaluator& eval, IndexType eval_offset) {

       for (IndexType i = 0; i < count; ++i) {

         target[i] = eval.coeff(eval_offset + i);

       }

     }

   };


   template <typename Evaluator>

   struct InnerDimAssign<true, Evaluator> {

     EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count, const Evaluator& eval, IndexType eval_offset) {

       typedef typename packet_traits<Scalar>::type Packet;


       const IndexType unrolled_size = (4 * PacketSize) * (count / (4 * PacketSize));

       const IndexType vectorized_size = PacketSize * (count / PacketSize);

       IndexType i = 0;


       for (; i < unrolled_size; i += 4 * PacketSize) {

         for (int j = 0; j < 4; ++j) {

           const IndexType idx = eval_offset + i + j * PacketSize;

           Packet p = eval.template packet<Unaligned>(idx);

           pstoreu<Scalar>(target + i + j * PacketSize, p);

         }

       }


       for (; i < vectorized_size; i += PacketSize) {

         Packet p = eval.template packet<Unaligned>(eval_offset + i);

         pstoreu<Scalar>(target + i, p);

       }


       for (; i < count; ++i) {

         target[i] = eval.coeff(eval_offset + i);

       }

     }

   };


  public:

   struct Target {

     Target(const Dimensions& target_dims, const Dimensions& target_strides, Scalar* target_data,

            IndexType target_offset = 0)

         : dims(target_dims), strides(target_strides), data(target_data), offset(target_offset) {}


     Dimensions dims;

     Dimensions strides;

     Scalar* data;

     IndexType offset;

   };


   static Target target(const Dimensions& target_dims, const Dimensions& target_strides, Scalar* target_data,

                        IndexType target_offset = 0) {

     return Target(target_dims, target_strides, target_data, target_offset);

   }


   template <typename TargetDimsIndexType, typename TargetStridesIndexType>

   static Target target(const DSizes<TargetDimsIndexType, NumDims>& target_dims,

                        const DSizes<TargetStridesIndexType, NumDims>& target_strides, Scalar* target_data,

                        IndexType target_offset = 0) {

     // DSizes constructor will do index type promotion if it's safe.

     return Target(Dimensions(target_dims), Dimensions(target_strides), target_data, target_offset);

   }


   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Target& target, const TensorBlockExpr& expr) {

     // Prepare evaluator for block expression.

     DefaultDevice default_device;

     TensorBlockEvaluator eval(expr, default_device);


     // Tensor block expression dimension should match destination dimensions.

     eigen_assert(dimensions_match(target.dims, eval.dimensions()));


     static const int Layout = TensorBlockEvaluator::Layout;

     static const bool is_col_major = Layout == ColMajor;


     // Initialize output inner dimension size based on a layout.

     const IndexType output_size = NumDims == 0 ? 1 : target.dims.TotalSize();

     const int inner_dim_idx = is_col_major ? 0 : NumDims - 1;

     IndexType output_inner_dim_size = target.dims[inner_dim_idx];


     // Target inner dimension stride must be '1'.

     eigen_assert(target.strides[inner_dim_idx] == 1);


     // Squeeze multiple inner dims into one if they are contiguous in `target`.

     IndexType num_squeezed_dims = 0;

     for (Index i = 1; i < NumDims; ++i) {

       const Index dim = is_col_major ? i : NumDims - i - 1;

       const IndexType target_stride = target.strides[dim];


       if (output_inner_dim_size == target_stride) {

         output_inner_dim_size *= target.dims[dim];

         num_squeezed_dims++;

       } else {

         break;

       }

     }


     // Initialize output block iterator state. Dimension in this array are

     // always in inner_most -> outer_most order (col major layout).

     array<BlockIteratorState, NumDims> it;


     int idx = 0;  // currently initialized iterator state index

     for (Index i = num_squeezed_dims; i < NumDims - 1; ++i) {

       const Index dim = is_col_major ? i + 1 : NumDims - i - 2;


       it[idx].count = 0;

       it[idx].size = target.dims[dim];

       it[idx].output_stride = target.strides[dim];

       it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);

       idx++;

     }


     // We read block expression from the beginning, and start writing data to

     // `target` at given offset.

     IndexType input_offset = 0;

     IndexType output_offset = target.offset;


     // Iterate copying data from `eval` to `target`.

     for (IndexType i = 0; i < output_size; i += output_inner_dim_size) {

       // Assign to `target` at current offset.

       InnerDimAssign<Vectorizable && TensorBlockEvaluator::PacketAccess, TensorBlockEvaluator>::Run(

           target.data + output_offset, output_inner_dim_size, eval, input_offset);


       // Move input offset forward by the number of assigned coefficients.

       input_offset += output_inner_dim_size;


       // Update index.

       for (int j = 0; j < idx; ++j) {

         if (++it[j].count < it[j].size) {

           output_offset += it[j].output_stride;

           break;

         }

         it[j].count = 0;

         output_offset -= it[j].output_span;

       }

     }

   }


  private:

   struct BlockIteratorState {

     BlockIteratorState() : count(0), size(0), output_stride(0), output_span(0) {}


     IndexType count;

     IndexType size;

     IndexType output_stride;

     IndexType output_span;

   };

 };


 // -------------------------------------------------------------------------- //


 }  // namespace internal

 }  // namespace Eigen


 #endif  // EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H

i
int i
Definition: BiCGSTAB_step_by_step.cpp:9

EIGEN_RESTRICT
#define EIGEN_RESTRICT
Definition: Macros.h:1067

EIGEN_ALWAYS_INLINE
#define EIGEN_ALWAYS_INLINE
Definition: Macros.h:845

EIGEN_UNUSED_VARIABLE
#define EIGEN_UNUSED_VARIABLE(var)
Definition: Macros.h:966

EIGEN_DEVICE_FUNC
#define EIGEN_DEVICE_FUNC
Definition: Macros.h:892

eigen_assert
#define eigen_assert(x)
Definition: Macros.h:910

EIGEN_STRONG_INLINE
#define EIGEN_STRONG_INLINE
Definition: Macros.h:834

COPY_INNER_DIM
#define COPY_INNER_DIM(KIND)

p
float * p
Definition: Tutorial_Map_using.cpp:9

size
Scalar Scalar int size
Definition: benchVecAdd.cpp:17

Scalar
SCALAR Scalar
Definition: bench_gemm.cpp:45

Packet
internal::packet_traits< Scalar >::type Packet
Definition: benchmark-blocking-sizes.cpp:54

Eigen::TensorCwiseBinaryOp
Definition: TensorExpr.h:162

Eigen::TensorCwiseUnaryOp
Definition: TensorExpr.h:97

Eigen::TensorMap
A tensor expression mapping an existing array of data.
Definition: TensorMap.h:33

Eigen::TensorOpCost
Definition: TensorCostModel.h:28

Eigen::internal::StridedLinearBufferCopy
Definition: TensorBlock.h:902

Eigen::internal::StridedLinearBufferCopy::Packet
packet_traits< Scalar >::type Packet
Definition: TensorBlock.h:903

Eigen::internal::StridedLinearBufferCopy::Kind
Kind
Definition: TensorBlock.h:914

Eigen::internal::StridedLinearBufferCopy::Kind::Scatter
@ Scatter

Eigen::internal::StridedLinearBufferCopy::Kind::FillScatter
@ FillScatter

Eigen::internal::StridedLinearBufferCopy::Kind::Linear
@ Linear

Eigen::internal::StridedLinearBufferCopy::Kind::Random
@ Random

Eigen::internal::StridedLinearBufferCopy::Kind::Gather
@ Gather

Eigen::internal::StridedLinearBufferCopy::Kind::FillLinear
@ FillLinear

Eigen::internal::StridedLinearBufferCopy::Run
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Dst &dst, const Src &src, const size_t count)
Definition: TensorBlock.h:940

Eigen::internal::StridedLinearBufferCopy::HalfPacketSize
@ HalfPacketSize
Definition: TensorBlock.h:908

Eigen::internal::StridedLinearBufferCopy::PacketSize
@ PacketSize
Definition: TensorBlock.h:907

Eigen::internal::StridedLinearBufferCopy::Vectorizable
@ Vectorizable
Definition: TensorBlock.h:906

Eigen::internal::StridedLinearBufferCopy::Run
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const IndexType count, const IndexType dst_offset, const IndexType dst_stride, Scalar *EIGEN_RESTRICT dst_data, const IndexType src_offset, const IndexType src_stride, const Scalar *EIGEN_RESTRICT src_data)
Definition: TensorBlock.h:946

Eigen::internal::StridedLinearBufferCopy::HalfPacket
unpacket_traits< Packet >::half HalfPacket
Definition: TensorBlock.h:904

Eigen::internal::TensorBlockAssignment
Definition: TensorBlock.h:1314

Eigen::internal::TensorBlockAssignment::Dimensions
DSizes< IndexType, NumDims > Dimensions
Definition: TensorBlock.h:1318

Eigen::internal::TensorBlockAssignment::PacketSize
@ PacketSize
Definition: TensorBlock.h:1320

Eigen::internal::TensorBlockAssignment::Vectorizable
@ Vectorizable
Definition: TensorBlock.h:1320

Eigen::internal::TensorBlockAssignment::Run
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Target &target, const TensorBlockExpr &expr)
Definition: TensorBlock.h:1384

Eigen::internal::TensorBlockAssignment::target
static Target target(const Dimensions &target_dims, const Dimensions &target_strides, Scalar *target_data, IndexType target_offset=0)
Definition: TensorBlock.h:1371

Eigen::internal::TensorBlockAssignment::target
static Target target(const DSizes< TargetDimsIndexType, NumDims > &target_dims, const DSizes< TargetStridesIndexType, NumDims > &target_strides, Scalar *target_data, IndexType target_offset=0)
Definition: TensorBlock.h:1377

Eigen::internal::TensorBlockAssignment::TensorBlockEvaluator
TensorEvaluator< const TensorBlockExpr, DefaultDevice > TensorBlockEvaluator
Definition: TensorBlock.h:1316

Eigen::internal::TensorBlockDescriptor::DestinationBuffer
Definition: TensorBlock.h:185

Eigen::internal::TensorBlockDescriptor::DestinationBuffer::make
static DestinationBuffer make(const TensorBlockDescriptor &desc, Scalar *data, const Dimensions &strides)
Definition: TensorBlock.h:237

Eigen::internal::TensorBlockDescriptor::DestinationBuffer::kind
static DestinationBufferKind kind(const TensorBlockDescriptor &desc, const Dimensions &strides)
Definition: TensorBlock.h:242

Eigen::internal::TensorBlockDescriptor::DestinationBuffer::m_kind
DestinationBufferKind m_kind
Definition: TensorBlock.h:261

Eigen::internal::TensorBlockDescriptor::DestinationBuffer::DestinationBufferKind
DestinationBufferKind
Definition: TensorBlock.h:187

Eigen::internal::TensorBlockDescriptor::DestinationBuffer::kEmpty
@ kEmpty
Definition: TensorBlock.h:197

Eigen::internal::TensorBlockDescriptor::DestinationBuffer::kStrided
@ kStrided
Definition: TensorBlock.h:215

Eigen::internal::TensorBlockDescriptor::DestinationBuffer::kContiguous
@ kContiguous
Definition: TensorBlock.h:203

Eigen::internal::TensorBlockDescriptor::DestinationBuffer::m_data
void * m_data
Definition: TensorBlock.h:254

Eigen::internal::TensorBlockDescriptor::DestinationBuffer::DestinationBuffer
DestinationBuffer()
Definition: TensorBlock.h:230

Eigen::internal::TensorBlockDescriptor::DestinationBuffer::m_strides
Dimensions m_strides
Definition: TensorBlock.h:259

Eigen::internal::TensorBlockDescriptor::DestinationBuffer::kind
const DestinationBufferKind & kind() const
Definition: TensorBlock.h:225

Eigen::internal::TensorBlockDescriptor::DestinationBuffer::strides
const Dimensions & strides() const
Definition: TensorBlock.h:224

Eigen::internal::TensorBlockDescriptor::DestinationBuffer::data
Scalar * data() const
Definition: TensorBlock.h:219

Eigen::internal::TensorBlockDescriptor::DestinationBuffer::m_data_type_size
size_t m_data_type_size
Definition: TensorBlock.h:255

Eigen::internal::TensorBlockDescriptor::DestinationBuffer::DestinationBuffer
DestinationBuffer(Scalar *data, const Dimensions &strides, DestinationBufferKind kind)
Definition: TensorBlock.h:233

Eigen::internal::TensorBlockDescriptor
Definition: TensorBlock.h:171

Eigen::internal::TensorBlockDescriptor::WithOffset
TensorBlockDescriptor WithOffset(IndexType offset) const
Definition: TensorBlock.h:298

Eigen::internal::TensorBlockDescriptor::AddDestinationBuffer
void AddDestinationBuffer(Scalar *dst_base, const DSizes< DstStridesIndexType, NumDims > &dst_strides)
Definition: TensorBlock.h:284

Eigen::internal::TensorBlockDescriptor::AddDestinationBuffer
void AddDestinationBuffer(Scalar *dst_base, const Dimensions &dst_strides)
Definition: TensorBlock.h:278

Eigen::internal::TensorBlockDescriptor::Dimensions
DSizes< IndexType, NumDims > Dimensions
Definition: TensorBlock.h:173

Eigen::internal::TensorBlockDescriptor::size
IndexType size() const
Definition: TensorBlock.h:273

Eigen::internal::TensorBlockDescriptor::destination
const DestinationBuffer & destination() const
Definition: TensorBlock.h:275

Eigen::internal::TensorBlockDescriptor::offset
IndexType offset() const
Definition: TensorBlock.h:270

Eigen::internal::TensorBlockDescriptor::HasDestinationBuffer
bool HasDestinationBuffer() const
Definition: TensorBlock.h:295

Eigen::internal::TensorBlockDescriptor::DropDestinationBuffer
TensorBlockDescriptor & DropDestinationBuffer()
Definition: TensorBlock.h:289

Eigen::internal::TensorBlockDescriptor::dimension
IndexType dimension(int index) const
Definition: TensorBlock.h:272

Eigen::internal::TensorBlockDescriptor::dimensions
const Dimensions & dimensions() const
Definition: TensorBlock.h:271

Eigen::internal::TensorBlockDescriptor::m_offset
const IndexType m_offset
Definition: TensorBlock.h:305

Eigen::internal::TensorBlockDescriptor::m_destination
DestinationBuffer m_destination
Definition: TensorBlock.h:307

Eigen::internal::TensorBlockDescriptor::m_dimensions
const Dimensions m_dimensions
Definition: TensorBlock.h:306

Eigen::internal::TensorBlockDescriptor::TensorBlockDescriptor
TensorBlockDescriptor(const IndexType offset, const Dimensions &dimensions, const DestinationBuffer &destination)
Definition: TensorBlock.h:264

Eigen::internal::TensorBlockDescriptor::TensorBlockDescriptor
TensorBlockDescriptor(const IndexType offset, const Dimensions &dimensions)
Definition: TensorBlock.h:267

Eigen::internal::TensorBlockIO
Definition: TensorBlock.h:1093

Eigen::internal::TensorBlockIO::NumSqueezableInnerDims
static int NumSqueezableInnerDims(const DimensionsMap &dim_map)
Definition: TensorBlock.h:1283

Eigen::internal::TensorBlockIO::IsColMajor
static constexpr bool IsColMajor
Definition: TensorBlock.h:1094

Eigen::internal::TensorBlockIO::Copy
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType Copy(const Dst &dst, const Src &src, const DimensionsMap &dst_to_src_dim_map)
Definition: TensorBlock.h:1126

Eigen::internal::TensorBlockIO::LinCopy
StridedLinearBufferCopy< Scalar, IndexType > LinCopy
Definition: TensorBlock.h:1096

Eigen::internal::TensorBlockIO::Dimensions
DSizes< IndexType, NumDims > Dimensions
Definition: TensorBlock.h:1099

Eigen::internal::TensorBlockIO::DimensionsMap
DSizes< int, NumDims > DimensionsMap
Definition: TensorBlock.h:1100

Eigen::internal::TensorBlockIO::Copy
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexType Copy(const Dst &dst, const Src &src)
Definition: TensorBlock.h:1262

Eigen::internal::TensorBlockMapper
Definition: TensorBlock.h:314

Eigen::internal::TensorBlockMapper::m_total_block_count
IndexType m_total_block_count
Definition: TensorBlock.h:459

Eigen::internal::TensorBlockMapper::m_requirements
TensorBlockResourceRequirements m_requirements
Definition: TensorBlock.h:456

Eigen::internal::TensorBlockMapper::blockDimensions
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DSizes< IndexType, NumDims > & blockDimensions() const
Definition: TensorBlock.h:331

Eigen::internal::TensorBlockMapper::TensorBlockMapper
TensorBlockMapper(const DSizes< IndexType, NumDims > &dimensions, const TensorBlockResourceRequirements &requirements)
Definition: TensorBlock.h:321

Eigen::internal::TensorBlockMapper::m_block_dimensions
DSizes< IndexType, NumDims > m_block_dimensions
Definition: TensorBlock.h:458

Eigen::internal::TensorBlockMapper::BlockDescriptor
TensorBlockDescriptor< NumDims, IndexType > BlockDescriptor
Definition: TensorBlock.h:315

Eigen::internal::TensorBlockMapper::m_tensor_strides
DSizes< IndexType, NumDims > m_tensor_strides
Definition: TensorBlock.h:461

Eigen::internal::TensorBlockMapper::blockCount
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockCount() const
Definition: TensorBlock.h:327

Eigen::internal::TensorBlockMapper::m_tensor_dimensions
DSizes< IndexType, NumDims > m_tensor_dimensions
Definition: TensorBlock.h:455

Eigen::internal::TensorBlockMapper::m_block_strides
DSizes< IndexType, NumDims > m_block_strides
Definition: TensorBlock.h:462

Eigen::internal::TensorBlockMapper::InitializeBlockDimensions
void InitializeBlockDimensions()
Definition: TensorBlock.h:359

Eigen::internal::TensorBlockMapper::Dimensions
DSizes< IndexType, NumDims > Dimensions
Definition: TensorBlock.h:318

Eigen::internal::TensorBlockMapper::TensorBlockMapper
TensorBlockMapper()=default

Eigen::internal::TensorBlockMapper::blockTotalSize
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockTotalSize() const
Definition: TensorBlock.h:329

Eigen::internal::TensorBlockMapper::blockDescriptor
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockDescriptor blockDescriptor(IndexType block_index) const
Definition: TensorBlock.h:335

Eigen::internal::TensorBlockNotImplemented
Definition: TensorBlock.h:566

Eigen::internal::TensorBlockNotImplemented::XprType
void XprType
Definition: TensorBlock.h:568

Eigen::internal::TensorBlockScratchAllocator
Definition: TensorBlock.h:475

Eigen::internal::TensorBlockScratchAllocator::allocate
void * allocate(size_t size)
Definition: TensorBlock.h:485

Eigen::internal::TensorBlockScratchAllocator::m_allocation_index
int m_allocation_index
Definition: TensorBlock.h:530

Eigen::internal::TensorBlockScratchAllocator::m_device
const Device & m_device
Definition: TensorBlock.h:529

Eigen::internal::TensorBlockScratchAllocator::~TensorBlockScratchAllocator
~TensorBlockScratchAllocator()
Definition: TensorBlock.h:479

Eigen::internal::TensorBlockScratchAllocator::m_allocations
std::vector< Allocation > m_allocations
Definition: TensorBlock.h:532

Eigen::internal::TensorBlockScratchAllocator::TensorBlockScratchAllocator
TensorBlockScratchAllocator(const Device &device)
Definition: TensorBlock.h:477

Eigen::internal::TensorBlockScratchAllocator::reset
void reset()
Definition: TensorBlock.h:521

Eigen::internal::TensorCwiseBinaryBlock
Definition: TensorBlock.h:796

Eigen::internal::TensorCwiseBinaryBlock::Scalar
XprScalar< XprType >::type Scalar
Definition: TensorBlock.h:806

Eigen::internal::TensorCwiseBinaryBlock::TensorCwiseBinaryBlock
TensorCwiseBinaryBlock(const LhsTensorBlock &left_block, const RhsTensorBlock &right_block, const BinaryOp &functor)
Definition: TensorBlock.h:808

Eigen::internal::TensorCwiseBinaryBlock::kind
TensorBlockKind kind() const
Definition: TensorBlock.h:811

Eigen::internal::TensorCwiseBinaryBlock::NoArgBlockAccess
static constexpr bool NoArgBlockAccess
Definition: TensorBlock.h:797

Eigen::internal::TensorCwiseBinaryBlock::expr
XprType expr() const
Definition: TensorBlock.h:813

Eigen::internal::TensorCwiseBinaryBlock::m_right_block
RhsTensorBlock m_right_block
Definition: TensorBlock.h:824

Eigen::internal::TensorCwiseBinaryBlock::XprType
std::conditional_t< NoArgBlockAccess, void, TensorCwiseBinaryOp< BinaryOp, const typename LhsTensorBlock::XprType, const typename RhsTensorBlock::XprType > > XprType
Definition: TensorBlock.h:804

Eigen::internal::TensorCwiseBinaryBlock::cleanup
void cleanup()
Definition: TensorBlock.h:817

Eigen::internal::TensorCwiseBinaryBlock::data
const Scalar * data() const
Definition: TensorBlock.h:815

Eigen::internal::TensorCwiseBinaryBlock::m_left_block
LhsTensorBlock m_left_block
Definition: TensorBlock.h:823

Eigen::internal::TensorCwiseBinaryBlock::m_functor
BinaryOp m_functor
Definition: TensorBlock.h:825

Eigen::internal::TensorCwiseUnaryBlock
Definition: TensorBlock.h:767

Eigen::internal::TensorCwiseUnaryBlock::m_arg_block
ArgTensorBlock m_arg_block
Definition: TensorBlock.h:787

Eigen::internal::TensorCwiseUnaryBlock::NoArgBlockAccess
static constexpr bool NoArgBlockAccess
Definition: TensorBlock.h:768

Eigen::internal::TensorCwiseUnaryBlock::XprType
std::conditional_t< NoArgBlockAccess, void, TensorCwiseUnaryOp< UnaryOp, const typename ArgTensorBlock::XprType > > XprType
Definition: TensorBlock.h:773

Eigen::internal::TensorCwiseUnaryBlock::Scalar
XprScalar< XprType >::type Scalar
Definition: TensorBlock.h:775

Eigen::internal::TensorCwiseUnaryBlock::TensorCwiseUnaryBlock
TensorCwiseUnaryBlock(const ArgTensorBlock &arg_block, const UnaryOp &functor)
Definition: TensorBlock.h:777

Eigen::internal::TensorCwiseUnaryBlock::m_functor
UnaryOp m_functor
Definition: TensorBlock.h:788

Eigen::internal::TensorCwiseUnaryBlock::cleanup
void cleanup()
Definition: TensorBlock.h:784

Eigen::internal::TensorCwiseUnaryBlock::data
const Scalar * data() const
Definition: TensorBlock.h:783

Eigen::internal::TensorCwiseUnaryBlock::kind
TensorBlockKind kind() const
Definition: TensorBlock.h:780

Eigen::internal::TensorCwiseUnaryBlock::expr
XprType expr() const
Definition: TensorBlock.h:782

Eigen::internal::TensorMaterializedBlock::Storage
Definition: TensorBlock.h:638

Eigen::internal::TensorMaterializedBlock::Storage::m_materialized_in_output
bool m_materialized_in_output
Definition: TensorBlock.h:664

Eigen::internal::TensorMaterializedBlock::Storage::AsTensorMaterializedBlock
TensorMaterializedBlock AsTensorMaterializedBlock() const
Definition: TensorBlock.h:644

Eigen::internal::TensorMaterializedBlock::Storage::m_strided_storage
bool m_strided_storage
Definition: TensorBlock.h:665

Eigen::internal::TensorMaterializedBlock::Storage::m_strides
Dimensions m_strides
Definition: TensorBlock.h:663

Eigen::internal::TensorMaterializedBlock::Storage::m_data
Scalar * m_data
Definition: TensorBlock.h:661

Eigen::internal::TensorMaterializedBlock::Storage::data
Scalar * data() const
Definition: TensorBlock.h:640

Eigen::internal::TensorMaterializedBlock::Storage::m_dimensions
Dimensions m_dimensions
Definition: TensorBlock.h:662

Eigen::internal::TensorMaterializedBlock::Storage::Storage
Storage(Scalar *data, const Dimensions &dimensions, const Dimensions &strides, bool materialized_in_output, bool strided_storage)
Definition: TensorBlock.h:653

Eigen::internal::TensorMaterializedBlock::Storage::strides
const Dimensions & strides() const
Definition: TensorBlock.h:642

Eigen::internal::TensorMaterializedBlock::Storage::dimensions
const Dimensions & dimensions() const
Definition: TensorBlock.h:641

Eigen::internal::TensorMaterializedBlock
Definition: TensorBlock.h:604

Eigen::internal::TensorMaterializedBlock::TensorMaterializedBlock
TensorMaterializedBlock(TensorBlockKind kind, const Scalar *data, const Dimensions &dimensions, bool valid_expr=true)
Definition: TensorBlock.h:609

Eigen::internal::TensorMaterializedBlock::materialize
static EIGEN_STRONG_INLINE TensorMaterializedBlock materialize(const Scalar *data, const DataDimensions &data_dims, TensorBlockDesc &desc, TensorBlockScratch &scratch)
Definition: TensorBlock.h:699

Eigen::internal::TensorMaterializedBlock::data
const Scalar * data() const
Definition: TensorBlock.h:625

Eigen::internal::TensorMaterializedBlock::expr
const XprType & expr() const
Definition: TensorBlock.h:621

Eigen::internal::TensorMaterializedBlock::kind
TensorBlockKind kind() const
Definition: TensorBlock.h:617

Eigen::internal::TensorMaterializedBlock::prepareStorage
static EIGEN_STRONG_INLINE Storage prepareStorage(TensorBlockDesc &desc, TensorBlockScratch &scratch, bool allow_strided_storage=false)
Definition: TensorBlock.h:671

Eigen::internal::TensorMaterializedBlock::m_valid_expr
bool m_valid_expr
Definition: TensorBlock.h:759

Eigen::internal::TensorMaterializedBlock::m_expr
XprType m_expr
Definition: TensorBlock.h:758

Eigen::internal::TensorMaterializedBlock::m_kind
TensorBlockKind m_kind
Definition: TensorBlock.h:755

Eigen::internal::TensorMaterializedBlock::m_data
const Scalar * m_data
Definition: TensorBlock.h:756

Eigen::internal::TensorMaterializedBlock::m_dimensions
Dimensions m_dimensions
Definition: TensorBlock.h:757

Eigen::internal::TensorMaterializedBlock::TensorBlockDesc
internal::TensorBlockDescriptor< NumDims, IndexType > TensorBlockDesc
Definition: TensorBlock.h:628

Eigen::internal::TensorMaterializedBlock::Dimensions
DSizes< IndexType, NumDims > Dimensions
Definition: TensorBlock.h:606

Eigen::internal::TensorMaterializedBlock::XprType
TensorMap< const Tensor< Scalar, NumDims, Layout > > XprType
Definition: TensorBlock.h:607

Eigen::internal::TensorMaterializedBlock::cleanup
void cleanup()
Definition: TensorBlock.h:626

Eigen::internal::TensorTernaryExprBlock
Definition: TensorBlock.h:861

Eigen::internal::TensorTernaryExprBlock::expr
XprType expr() const
Definition: TensorBlock.h:882

Eigen::internal::TensorTernaryExprBlock::kind
TensorBlockKind kind() const
Definition: TensorBlock.h:881

Eigen::internal::TensorTernaryExprBlock::Arg1XprType
Arg1TensorBlock::XprType Arg1XprType
Definition: TensorBlock.h:862

Eigen::internal::TensorTernaryExprBlock::TensorTernaryExprBlock
TensorTernaryExprBlock(const Arg1TensorBlock &arg1_block, const Arg2TensorBlock &arg2_block, const Arg3TensorBlock &arg3_block, const BlockFactory &factory)
Definition: TensorBlock.h:877

Eigen::internal::TensorTernaryExprBlock::XprType
std::conditional_t< NoArgBlockAccess, void, typename BlockFactory::template XprType< Arg1XprType, Arg2XprType, Arg3XprType >::type > XprType
Definition: TensorBlock.h:873

Eigen::internal::TensorTernaryExprBlock::data
const Scalar * data() const
Definition: TensorBlock.h:883

Eigen::internal::TensorTernaryExprBlock::Arg3XprType
Arg3TensorBlock::XprType Arg3XprType
Definition: TensorBlock.h:864

Eigen::internal::TensorTernaryExprBlock::m_factory
BlockFactory m_factory
Definition: TensorBlock.h:894

Eigen::internal::TensorTernaryExprBlock::m_arg3_block
Arg3TensorBlock m_arg3_block
Definition: TensorBlock.h:893

Eigen::internal::TensorTernaryExprBlock::NoArgBlockAccess
static constexpr bool NoArgBlockAccess
Definition: TensorBlock.h:866

Eigen::internal::TensorTernaryExprBlock::m_arg1_block
Arg1TensorBlock m_arg1_block
Definition: TensorBlock.h:891

Eigen::internal::TensorTernaryExprBlock::m_arg2_block
Arg2TensorBlock m_arg2_block
Definition: TensorBlock.h:892

Eigen::internal::TensorTernaryExprBlock::Scalar
XprScalar< XprType >::type Scalar
Definition: TensorBlock.h:875

Eigen::internal::TensorTernaryExprBlock::cleanup
void cleanup()
Definition: TensorBlock.h:884

Eigen::internal::TensorTernaryExprBlock::Arg2XprType
Arg2TensorBlock::XprType Arg2XprType
Definition: TensorBlock.h:863

Eigen::internal::TensorUnaryExprBlock
Definition: TensorBlock.h:834

Eigen::internal::TensorUnaryExprBlock::TensorUnaryExprBlock
TensorUnaryExprBlock(const ArgTensorBlock &arg_block, const BlockFactory &factory)
Definition: TensorBlock.h:843

Eigen::internal::TensorUnaryExprBlock::expr
XprType expr() const
Definition: TensorBlock.h:847

Eigen::internal::TensorUnaryExprBlock::Scalar
XprScalar< XprType >::type Scalar
Definition: TensorBlock.h:841

Eigen::internal::TensorUnaryExprBlock::cleanup
void cleanup()
Definition: TensorBlock.h:849

Eigen::internal::TensorUnaryExprBlock::ArgXprType
ArgTensorBlock::XprType ArgXprType
Definition: TensorBlock.h:835

Eigen::internal::TensorUnaryExprBlock::NoArgBlockAccess
static constexpr bool NoArgBlockAccess
Definition: TensorBlock.h:836

Eigen::internal::TensorUnaryExprBlock::data
const Scalar * data() const
Definition: TensorBlock.h:848

Eigen::internal::TensorUnaryExprBlock::XprType
std::conditional_t< NoArgBlockAccess, void, typename BlockFactory::template XprType< ArgXprType >::type > XprType
Definition: TensorBlock.h:839

Eigen::internal::TensorUnaryExprBlock::m_factory
BlockFactory m_factory
Definition: TensorBlock.h:853

Eigen::internal::TensorUnaryExprBlock::m_arg_block
ArgTensorBlock m_arg_block
Definition: TensorBlock.h:852

Eigen::internal::TensorUnaryExprBlock::kind
TensorBlockKind kind() const
Definition: TensorBlock.h:846

sizes
std::vector< Array2i > sizes
Definition: dense_solvers.cpp:12

Eigen::ColMajor
@ ColMajor
Definition: Constants.h:318

s
RealScalar s
Definition: level1_cplx_impl.h:130

Eigen::Architecture::Target
@ Target
Definition: Constants.h:495

Eigen::bfloat16_impl::pow
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 pow(const bfloat16 &a, const bfloat16 &b)
Definition: BFloat16.h:625

Eigen::internal::TensorBlockKind
TensorBlockKind
Definition: TensorBlock.h:538

Eigen::internal::kMaterializedInOutput
@ kMaterializedInOutput
Definition: TensorBlock.h:559

Eigen::internal::kMaterializedInScratch
@ kMaterializedInScratch
Definition: TensorBlock.h:550

Eigen::internal::kView
@ kView
Definition: TensorBlock.h:545

Eigen::internal::kExpr
@ kExpr
Definition: TensorBlock.h:541

Eigen::internal::array_prod
constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE auto array_prod(const array< T, N > &arr) -> decltype(array_reduce< product_op, T, N >(arr, static_cast< T >(1)))
Definition: MoreMeta.h:497

Eigen::internal::strides
EIGEN_ALWAYS_INLINE DSizes< IndexType, NumDims > strides(const DSizes< IndexType, NumDims > &dimensions)
Definition: TensorBlock.h:29

Eigen::internal::TensorBlockShapeType
TensorBlockShapeType
Definition: TensorBlock.h:73

Eigen::internal::TensorBlockShapeType::kUniformAllDims
@ kUniformAllDims

Eigen::internal::TensorBlockShapeType::kSkewedInnerDims
@ kSkewedInnerDims

Eigen::numext::maxi
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T maxi(const T &x, const T &y)
Definition: MathFunctions.h:926

Eigen::numext::div_ceil
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE EIGEN_CONSTEXPR T div_ceil(T a, T b)
Definition: MathFunctions.h:1251

Eigen::numext::mini
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T mini(const T &x, const T &y)
Definition: MathFunctions.h:920

Eigen
Namespace containing all symbols from the Eigen library.
Definition: bench_norm.cpp:70

Eigen::dimensions_match
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool dimensions_match(Dims1 dims1, Dims2 dims2)
Definition: TensorDimensions.h:322

Eigen::array
std::array< T, N > array
Definition: EmulateArray.h:231

Eigen::Index
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition: Meta.h:83

compute_granudrum_aor.type
type
Definition: compute_granudrum_aor.py:141

internal
Definition: Eigen_Colamd.h:49

XprType
CwiseBinaryOp< internal::scalar_sum_op< double, double >, const CpyMatrixXd, const CpyMatrixXd > XprType
Definition: nestbyvalue.cpp:15

eval
internal::nested_eval< T, 1 >::type eval(const T &xpr)
Definition: sparse_permutations.cpp:47

Eigen::DSizes< IndexType, NumDims >

Eigen::DSizes::TotalSize
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex TotalSize() const
Definition: TensorDimensions.h:167

Eigen::DSizes::rank
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const
Definition: TensorDimensions.h:165

Eigen::DefaultDevice
Definition: TensorDeviceDefault.h:19

Eigen::Sizes
Definition: TensorDimensions.h:85

Eigen::TensorEvaluator
A cost model used to limit the number of threads used for evaluating tensor expression.
Definition: TensorEvaluator.h:31

Eigen::TensorEvaluator::Layout
static constexpr int Layout
Definition: TensorEvaluator.h:46

Eigen::internal::StridedLinearBufferCopy::Dst
Definition: TensorBlock.h:923

Eigen::internal::StridedLinearBufferCopy::Dst::Dst
Dst(IndexType o, IndexType s, Scalar *d)
Definition: TensorBlock.h:924

Eigen::internal::StridedLinearBufferCopy::Dst::offset
IndexType offset
Definition: TensorBlock.h:926

Eigen::internal::StridedLinearBufferCopy::Dst::data
Scalar * data
Definition: TensorBlock.h:928

Eigen::internal::StridedLinearBufferCopy::Dst::stride
IndexType stride
Definition: TensorBlock.h:927

Eigen::internal::StridedLinearBufferCopy::Src
Definition: TensorBlock.h:931

Eigen::internal::StridedLinearBufferCopy::Src::data
const Scalar * data
Definition: TensorBlock.h:936

Eigen::internal::StridedLinearBufferCopy::Src::Src
Src(IndexType o, IndexType s, const Scalar *d)
Definition: TensorBlock.h:932

Eigen::internal::StridedLinearBufferCopy::Src::offset
IndexType offset
Definition: TensorBlock.h:934

Eigen::internal::StridedLinearBufferCopy::Src::stride
IndexType stride
Definition: TensorBlock.h:935

Eigen::internal::TensorBlockAssignment::BlockIteratorState
Definition: TensorBlock.h:1459

Eigen::internal::TensorBlockAssignment::BlockIteratorState::output_span
IndexType output_span
Definition: TensorBlock.h:1465

Eigen::internal::TensorBlockAssignment::BlockIteratorState::size
IndexType size
Definition: TensorBlock.h:1463

Eigen::internal::TensorBlockAssignment::BlockIteratorState::output_stride
IndexType output_stride
Definition: TensorBlock.h:1464

Eigen::internal::TensorBlockAssignment::BlockIteratorState::count
IndexType count
Definition: TensorBlock.h:1462

Eigen::internal::TensorBlockAssignment::BlockIteratorState::BlockIteratorState
BlockIteratorState()
Definition: TensorBlock.h:1460

Eigen::internal::TensorBlockAssignment::InnerDimAssign< true, Evaluator >::Run
static EIGEN_ALWAYS_INLINE void Run(Scalar *target, IndexType count, const Evaluator &eval, IndexType eval_offset)
Definition: TensorBlock.h:1333

Eigen::internal::TensorBlockAssignment::InnerDimAssign
Definition: TensorBlock.h:1323

Eigen::internal::TensorBlockAssignment::InnerDimAssign::Run
static EIGEN_ALWAYS_INLINE void Run(Scalar *target, IndexType count, const Evaluator &eval, IndexType eval_offset)
Definition: TensorBlock.h:1324

Eigen::internal::TensorBlockAssignment::Target
Definition: TensorBlock.h:1360

Eigen::internal::TensorBlockAssignment::Target::strides
Dimensions strides
Definition: TensorBlock.h:1366

Eigen::internal::TensorBlockAssignment::Target::dims
Dimensions dims
Definition: TensorBlock.h:1365

Eigen::internal::TensorBlockAssignment::Target::data
Scalar * data
Definition: TensorBlock.h:1367

Eigen::internal::TensorBlockAssignment::Target::Target
Target(const Dimensions &target_dims, const Dimensions &target_strides, Scalar *target_data, IndexType target_offset=0)
Definition: TensorBlock.h:1361

Eigen::internal::TensorBlockAssignment::Target::offset
IndexType offset
Definition: TensorBlock.h:1368

Eigen::internal::TensorBlockIO::BlockIteratorState
Definition: TensorBlock.h:1269

Eigen::internal::TensorBlockIO::BlockIteratorState::input_stride
IndexType input_stride
Definition: TensorBlock.h:1274

Eigen::internal::TensorBlockIO::BlockIteratorState::count
IndexType count
Definition: TensorBlock.h:1273

Eigen::internal::TensorBlockIO::BlockIteratorState::size
IndexType size
Definition: TensorBlock.h:1272

Eigen::internal::TensorBlockIO::BlockIteratorState::output_span
IndexType output_span
Definition: TensorBlock.h:1277

Eigen::internal::TensorBlockIO::BlockIteratorState::BlockIteratorState
BlockIteratorState()
Definition: TensorBlock.h:1270

Eigen::internal::TensorBlockIO::BlockIteratorState::output_stride
IndexType output_stride
Definition: TensorBlock.h:1275

Eigen::internal::TensorBlockIO::BlockIteratorState::input_span
IndexType input_span
Definition: TensorBlock.h:1276

Eigen::internal::TensorBlockIO::Dst
Definition: TensorBlock.h:1102

Eigen::internal::TensorBlockIO::Dst::offset
IndexType offset
Definition: TensorBlock.h:1109

Eigen::internal::TensorBlockIO::Dst::strides
Dimensions strides
Definition: TensorBlock.h:1107

Eigen::internal::TensorBlockIO::Dst::dims
Dimensions dims
Definition: TensorBlock.h:1106

Eigen::internal::TensorBlockIO::Dst::data
Scalar * data
Definition: TensorBlock.h:1108

Eigen::internal::TensorBlockIO::Dst::Dst
Dst(const Dimensions &dst_dims, const Dimensions &dst_strides, Scalar *dst, IndexType dst_offset=0)
Definition: TensorBlock.h:1103

Eigen::internal::TensorBlockIO::Src
Definition: TensorBlock.h:1112

Eigen::internal::TensorBlockIO::Src::data
const Scalar * data
Definition: TensorBlock.h:1117

Eigen::internal::TensorBlockIO::Src::strides
Dimensions strides
Definition: TensorBlock.h:1116

Eigen::internal::TensorBlockIO::Src::offset
IndexType offset
Definition: TensorBlock.h:1118

Eigen::internal::TensorBlockIO::Src::Src
Src(const Dimensions &src_strides, const Scalar *src, IndexType src_offset=0)
Definition: TensorBlock.h:1113

Eigen::internal::TensorBlockResourceRequirements
Definition: TensorBlock.h:75

Eigen::internal::TensorBlockResourceRequirements::addCostPerCoeff
EIGEN_DEVICE_FUNC TensorBlockResourceRequirements & addCostPerCoeff(TensorOpCost cost)
Definition: TensorBlock.h:135

Eigen::internal::TensorBlockResourceRequirements::withShapeAndSize
static EIGEN_DEVICE_FUNC TensorBlockResourceRequirements withShapeAndSize(TensorBlockShapeType shape_type, size_t size_in_bytes)
Definition: TensorBlock.h:96

Eigen::internal::TensorBlockResourceRequirements::withShapeAndSize
static EIGEN_DEVICE_FUNC TensorBlockResourceRequirements withShapeAndSize(TensorBlockShapeType shape_type, size_t size_in_bytes, TensorOpCost cost)
Definition: TensorBlock.h:89

Eigen::internal::TensorBlockResourceRequirements::skewed
static EIGEN_DEVICE_FUNC TensorBlockResourceRequirements skewed(size_t size_in_bytes)
Definition: TensorBlock.h:119

Eigen::internal::TensorBlockResourceRequirements::merge
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockShapeType merge(TensorBlockShapeType lhs, TensorBlockShapeType rhs)
Definition: TensorBlock.h:154

Eigen::internal::TensorBlockResourceRequirements::merge
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t merge(size_t lhs_size, size_t rhs_size)
Definition: TensorBlock.h:150

Eigen::internal::TensorBlockResourceRequirements::any
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockResourceRequirements any()
Definition: TensorBlock.h:143

Eigen::internal::TensorBlockResourceRequirements::size
size_t size
Definition: TensorBlock.h:77

Eigen::internal::TensorBlockResourceRequirements::merge
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockResourceRequirements merge(const TensorBlockResourceRequirements &lhs, const TensorBlockResourceRequirements &rhs)
Definition: TensorBlock.h:129

Eigen::internal::TensorBlockResourceRequirements::uniform
static EIGEN_DEVICE_FUNC TensorBlockResourceRequirements uniform(size_t size_in_bytes)
Definition: TensorBlock.h:124

Eigen::internal::TensorBlockResourceRequirements::merge
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost merge(TensorOpCost lhs_cost, TensorOpCost rhs_cost)
Definition: TensorBlock.h:161

Eigen::internal::TensorBlockResourceRequirements::cost_per_coeff
TensorOpCost cost_per_coeff
Definition: TensorBlock.h:78

Eigen::internal::TensorBlockResourceRequirements::shape_type
TensorBlockShapeType shape_type
Definition: TensorBlock.h:76

Eigen::internal::TensorBlockScratchAllocator::Allocation
Definition: TensorBlock.h:524

Eigen::internal::TensorBlockScratchAllocator::Allocation::size
size_t size
Definition: TensorBlock.h:526

Eigen::internal::TensorBlockScratchAllocator::Allocation::ptr
void * ptr
Definition: TensorBlock.h:525

Eigen::internal::XprScalar< void >::type
void type
Definition: TensorBlock.h:582

Eigen::internal::XprScalar
Definition: TensorBlock.h:577

Eigen::internal::XprScalar::type
XprType::Scalar type
Definition: TensorBlock.h:578

Eigen::internal::array_size
Definition: Meta.h:305

Eigen::internal::eval
Definition: XprHelper.h:427

Eigen::internal::is_void
Definition: Meta.h:214

Eigen::internal::packet_traits
Definition: GenericPacketMath.h:108

Eigen::internal::unpacket_traits
Definition: GenericPacketMath.h:134

j
std::ptrdiff_t j
Definition: tut_arithmetic_redux_minmax.cpp:2

Eigen::internal::Packet
Definition: ZVector/PacketMath.h:50

InternalHeaderCheck.h