dc/d5c/TensorConvolution_8h_source.html

 // This file is part of Eigen, a lightweight C++ template library

 // for linear algebra.

 //

 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>

 //

 // This Source Code Form is subject to the terms of the Mozilla

 // Public License v. 2.0. If a copy of the MPL was not distributed

 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.


 #ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H

 #define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H


 // IWYU pragma: private

 #include "./InternalHeaderCheck.h"


 namespace Eigen {


 namespace internal {


 template <typename Index, typename InputDims, int NumKernelDims, int Layout>

 class IndexMapper {

  public:

   IndexMapper(const InputDims& input_dims, const array<Index, NumKernelDims>& kernel_dims,

               const array<Index, NumKernelDims>& indices) {

     array<Index, NumDims> dimensions = input_dims;

     for (int i = 0; i < NumKernelDims; ++i) {

       const Index index = indices[i];

       const Index input_dim = input_dims[index];

       const Index kernel_dim = kernel_dims[i];

       const Index result_dim = input_dim - kernel_dim + 1;

       dimensions[index] = result_dim;

     }


     array<Index, NumDims> inputStrides;

     array<Index, NumDims> outputStrides;

     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {

       inputStrides[0] = 1;

       outputStrides[0] = 1;

       for (int i = 1; i < NumDims; ++i) {

         inputStrides[i] = inputStrides[i - 1] * input_dims[i - 1];

         outputStrides[i] = outputStrides[i - 1] * dimensions[i - 1];

       }

     } else {

       inputStrides[NumDims - 1] = 1;

       outputStrides[NumDims - 1] = 1;

       for (int i = static_cast<int>(NumDims) - 2; i >= 0; --i) {

         inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1];

         outputStrides[i] = outputStrides[i + 1] * dimensions[i + 1];

       }

     }


     array<Index, NumDims> gpuInputDimensions;

     array<Index, NumDims> gpuOutputDimensions;

     array<Index, NumDims> tmp = dimensions;

     array<Index, NumDims> ordering;

     const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - NumKernelDims;

     for (int i = 0; i < NumKernelDims; ++i) {

       const Index index = i + offset;

       ordering[index] = indices[i];

       tmp[indices[i]] = -1;

       gpuInputDimensions[index] = input_dims[indices[i]];

       gpuOutputDimensions[index] = dimensions[indices[i]];

     }


     int written = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? NumKernelDims : 0;

     for (int i = 0; i < NumDims; ++i) {

       if (tmp[i] >= 0) {

         ordering[written] = i;

         gpuInputDimensions[written] = input_dims[i];

         gpuOutputDimensions[written] = dimensions[i];

         ++written;

       }

     }


     for (int i = 0; i < NumDims; ++i) {

       m_inputStrides[i] = inputStrides[ordering[i]];

       m_outputStrides[i] = outputStrides[ordering[i]];

     }


     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {

       for (int i = 0; i < NumDims; ++i) {

         if (i > NumKernelDims) {

           m_gpuInputStrides[i] = m_gpuInputStrides[i - 1] * gpuInputDimensions[i - 1];

           m_gpuOutputStrides[i] = m_gpuOutputStrides[i - 1] * gpuOutputDimensions[i - 1];

         } else {

           m_gpuInputStrides[i] = 1;

           m_gpuOutputStrides[i] = 1;

         }

       }

     } else {

       for (int i = NumDims - 1; i >= 0; --i) {

         if (static_cast<size_t>(i + 1) < offset) {

           m_gpuInputStrides[i] = m_gpuInputStrides[i + 1] * gpuInputDimensions[i + 1];

           m_gpuOutputStrides[i] = m_gpuOutputStrides[i + 1] * gpuOutputDimensions[i + 1];

         } else {

           m_gpuInputStrides[i] = 1;

           m_gpuOutputStrides[i] = 1;

         }

       }

     }

   }


   EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputPlaneToTensorInputOffset(Index p) const {

     Index inputIndex = 0;

     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {

       for (int d = NumDims - 1; d > NumKernelDims; --d) {

         const Index idx = p / m_gpuInputStrides[d];

         inputIndex += idx * m_inputStrides[d];

         p -= idx * m_gpuInputStrides[d];

       }

       if (NumKernelDims < NumDims) {

         inputIndex += p * m_inputStrides[NumKernelDims];

       }

     } else {

       std::ptrdiff_t limit = 0;

       if (NumKernelDims < NumDims) {

         limit = NumDims - NumKernelDims - 1;

       }

       for (int d = 0; d < limit; ++d) {

         const Index idx = p / m_gpuInputStrides[d];

         inputIndex += idx * m_inputStrides[d];

         p -= idx * m_gpuInputStrides[d];

       }

       inputIndex += p * m_inputStrides[limit];

     }

     return inputIndex;

   }


   EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputPlaneToTensorOutputOffset(Index p) const {

     Index outputIndex = 0;

     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {

       for (int d = NumDims - 1; d > NumKernelDims; --d) {

         const Index idx = p / m_gpuOutputStrides[d];

         outputIndex += idx * m_outputStrides[d];

         p -= idx * m_gpuOutputStrides[d];

       }

       if (NumKernelDims < NumDims) {

         outputIndex += p * m_outputStrides[NumKernelDims];

       }

     } else {

       std::ptrdiff_t limit = 0;

       if (NumKernelDims < NumDims) {

         limit = NumDims - NumKernelDims - 1;

       }

       for (int d = 0; d < limit; ++d) {

         const Index idx = p / m_gpuOutputStrides[d];

         outputIndex += idx * m_outputStrides[d];

         p -= idx * m_gpuOutputStrides[d];

       }

       outputIndex += p * m_outputStrides[limit];

     }

     return outputIndex;

   }


   EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i) const {

     const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - NumKernelDims;

     return i * m_inputStrides[offset];

   }


   EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i) const {

     const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - NumKernelDims;

     return i * m_outputStrides[offset];

   }


   EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j) const {

     const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - NumKernelDims;

     return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1];

   }


   EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j) const {

     const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - NumKernelDims;

     return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1];

   }


   EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j, Index k) const {

     const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - NumKernelDims;

     return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1] + k * m_inputStrides[offset + 2];

   }


   EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const {

     const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - NumKernelDims;

     return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1] + k * m_outputStrides[offset + 2];

   }


  private:

   static constexpr int NumDims = internal::array_size<InputDims>::value;

   array<Index, NumDims> m_inputStrides;

   array<Index, NumDims> m_outputStrides;

   array<Index, NumDims> m_gpuInputStrides;

   array<Index, NumDims> m_gpuOutputStrides;

 };


 template <typename Dimensions, typename InputXprType, typename KernelXprType>

 struct traits<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> > {

   // Type promotion to handle the case where the types of the lhs and the rhs are different.

   typedef typename promote_storage_type<typename InputXprType::Scalar, typename KernelXprType::Scalar>::ret Scalar;

   typedef typename promote_storage_type<typename traits<InputXprType>::StorageKind,

                                         typename traits<KernelXprType>::StorageKind>::ret StorageKind;

   typedef typename promote_index_type<typename traits<InputXprType>::Index, typename traits<KernelXprType>::Index>::type

       Index;

   typedef typename InputXprType::Nested LhsNested;

   typedef typename KernelXprType::Nested RhsNested;

   typedef std::remove_reference_t<LhsNested> LhsNested_;

   typedef std::remove_reference_t<RhsNested> RhsNested_;

   static constexpr int NumDimensions = traits<InputXprType>::NumDimensions;

   static constexpr int Layout = traits<InputXprType>::Layout;

   typedef std::conditional_t<Pointer_type_promotion<typename InputXprType::Scalar, Scalar>::val,

                              typename traits<InputXprType>::PointerType, typename traits<KernelXprType>::PointerType>

       PointerType;


   enum { Flags = 0 };

 };


 template <typename Dimensions, typename InputXprType, typename KernelXprType>

 struct eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, Eigen::Dense> {

   typedef const TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>& type;

 };


 template <typename Dimensions, typename InputXprType, typename KernelXprType>

 struct nested<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, 1,

               typename eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >::type> {

   typedef TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> type;

 };


 }  // end namespace internal


 template <typename Indices, typename InputXprType, typename KernelXprType>

 class TensorConvolutionOp

     : public TensorBase<TensorConvolutionOp<Indices, InputXprType, KernelXprType>, ReadOnlyAccessors> {

  public:

   typedef typename Eigen::internal::traits<TensorConvolutionOp>::Scalar Scalar;

   typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;

   typedef typename internal::promote_storage_type<typename InputXprType::CoeffReturnType,

                                                   typename KernelXprType::CoeffReturnType>::ret CoeffReturnType;

   typedef typename Eigen::internal::nested<TensorConvolutionOp>::type Nested;

   typedef typename Eigen::internal::traits<TensorConvolutionOp>::StorageKind StorageKind;

   typedef typename Eigen::internal::traits<TensorConvolutionOp>::Index Index;


   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionOp(const InputXprType& input, const KernelXprType& kernel,

                                                             const Indices& dims)

       : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {}


   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Indices& indices() const { return m_indices; }


   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const internal::remove_all_t<typename InputXprType::Nested>& inputExpression()

       const {

     return m_input_xpr;

   }


   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const internal::remove_all_t<typename KernelXprType::Nested>& kernelExpression()

       const {

     return m_kernel_xpr;

   }


  protected:

   typename InputXprType::Nested m_input_xpr;

   typename KernelXprType::Nested m_kernel_xpr;

   const Indices m_indices;

 };


 template <typename Indices, typename InputArgType, typename KernelArgType, typename Device>

 struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device> {

   typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;


   static constexpr int NumDims =

       internal::array_size<typename TensorEvaluator<InputArgType, Device>::Dimensions>::value;

   static constexpr int NumKernelDims = internal::array_size<Indices>::value;

   typedef typename XprType::Index Index;

   typedef DSizes<Index, NumDims> Dimensions;


   typedef typename XprType::Scalar Scalar;

   typedef typename XprType::CoeffReturnType CoeffReturnType;

   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;

   static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;

   typedef StorageMemory<Scalar, Device> Storage;

   typedef typename Storage::Type EvaluatorPointerType;


   static constexpr int Layout = TensorEvaluator<InputArgType, Device>::Layout;

   enum {

     IsAligned =

         int(TensorEvaluator<InputArgType, Device>::IsAligned) & int(TensorEvaluator<KernelArgType, Device>::IsAligned),

     PacketAccess = int(TensorEvaluator<InputArgType, Device>::PacketAccess) &

                    int(TensorEvaluator<KernelArgType, Device>::PacketAccess),

     BlockAccess = false,

     PreferBlockAccess = false,

     CoordAccess = false,  // to be implemented

     RawAccess = false

   };


   //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//

   typedef internal::TensorBlockNotImplemented TensorBlock;

   //===--------------------------------------------------------------------===//


   EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)

       : m_inputImpl(op.inputExpression(), device),

         m_kernelImpl(op.kernelExpression(), device),

         m_kernelArg(op.kernelExpression()),

         m_kernel(NULL),

         m_local_kernel(false),

         m_device(device) {

     EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Device>::Layout) ==

                          static_cast<int>(TensorEvaluator<KernelArgType, Device>::Layout)),

                         YOU_MADE_A_PROGRAMMING_MISTAKE);


     const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions();

     const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions();


     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {

       m_inputStride[0] = 1;

       for (int i = 1; i < NumDims; ++i) {

         m_inputStride[i] = m_inputStride[i - 1] * input_dims[i - 1];

       }

     } else {

       m_inputStride[NumDims - 1] = 1;

       for (int i = NumDims - 2; i >= 0; --i) {

         m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1];

       }

     }


     m_dimensions = m_inputImpl.dimensions();

     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {

       for (int i = 0; i < NumKernelDims; ++i) {

         const Index index = op.indices()[i];

         const Index input_dim = input_dims[index];

         const Index kernel_dim = kernel_dims[i];

         const Index result_dim = input_dim - kernel_dim + 1;

         m_dimensions[index] = result_dim;

         if (i > 0) {

           m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1];

         } else {

           m_kernelStride[0] = 1;

         }

         m_indexStride[i] = m_inputStride[index];

       }


       m_outputStride[0] = 1;

       for (int i = 1; i < NumDims; ++i) {

         m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1];

       }

     } else {

       for (int i = NumKernelDims - 1; i >= 0; --i) {

         const Index index = op.indices()[i];

         const Index input_dim = input_dims[index];

         const Index kernel_dim = kernel_dims[i];

         const Index result_dim = input_dim - kernel_dim + 1;

         m_dimensions[index] = result_dim;

         if (i < NumKernelDims - 1) {

           m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1];

         } else {

           m_kernelStride[NumKernelDims - 1] = 1;

         }

         m_indexStride[i] = m_inputStride[index];

       }


       m_outputStride[NumDims - 1] = 1;

       for (int i = NumDims - 2; i >= 0; --i) {

         m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1];

       }

     }

   }


   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }


   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {

     m_inputImpl.evalSubExprsIfNeeded(NULL);

     preloadKernel();

     return true;

   }

   EIGEN_STRONG_INLINE void cleanup() {

     m_inputImpl.cleanup();

     if (m_local_kernel) {

       m_device.deallocate((void*)m_kernel);

       m_local_kernel = false;

     }

     m_kernel = NULL;

   }


   void evalTo(typename XprType::Scalar* buffer) {

     evalSubExprsIfNeeded(NULL);

     for (int i = 0; i < dimensions().TotalSize(); ++i) {

       buffer[i] += coeff(i);

     }

     cleanup();

   }


   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {

     CoeffReturnType result = CoeffReturnType(0);

     convolve(firstInput(index), 0, NumKernelDims - 1, result);

     return result;

   }


   template <int LoadMode>

   EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const {

     Index indices[2] = {index, index + PacketSize - 1};

     Index startInputs[2] = {0, 0};

     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {

       for (int i = NumDims - 1; i > 0; --i) {

         const Index idx0 = indices[0] / m_outputStride[i];

         const Index idx1 = indices[1] / m_outputStride[i];

         startInputs[0] += idx0 * m_inputStride[i];

         startInputs[1] += idx1 * m_inputStride[i];

         indices[0] -= idx0 * m_outputStride[i];

         indices[1] -= idx1 * m_outputStride[i];

       }

     } else {

       for (int i = 0; i < NumDims - 1; ++i) {

         const Index idx0 = indices[0] / m_outputStride[i];

         const Index idx1 = indices[1] / m_outputStride[i];

         startInputs[0] += idx0 * m_inputStride[i];

         startInputs[1] += idx1 * m_inputStride[i];

         indices[0] -= idx0 * m_outputStride[i];

         indices[1] -= idx1 * m_outputStride[i];

       }

     }

     startInputs[0] += indices[0];

     startInputs[1] += indices[1];


     if (startInputs[1] - startInputs[0] == PacketSize - 1) {

       PacketReturnType result = internal::pset1<PacketReturnType>(0);

       convolvePacket(startInputs[0], 0, NumKernelDims - 1, result);

       return result;

     } else {

       EIGEN_ALIGN_MAX Scalar data[PacketSize];

       data[0] = Scalar(0);

       convolve(startInputs[0], 0, NumKernelDims - 1, data[0]);

       for (int i = 1; i < PacketSize - 1; ++i) {

         data[i] = Scalar(0);

         convolve(firstInput(index + i), 0, NumKernelDims - 1, data[i]);

       }

       data[PacketSize - 1] = Scalar(0);

       convolve(startInputs[1], 0, NumKernelDims - 1, data[PacketSize - 1]);

       return internal::pload<PacketReturnType>(data);

     }

   }


   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {

     const double kernel_size = m_kernelImpl.dimensions().TotalSize();

     // We ignore the use of fused multiply-add.

     const double convolve_compute_cost = TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();

     const double firstIndex_compute_cost =

         NumDims *

         (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>());

     return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +

            kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) +

                           TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize));

   }


   EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }


  private:

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {

     Index startInput = 0;

     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {

       for (int i = NumDims - 1; i > 0; --i) {

         const Index idx = index / m_outputStride[i];

         startInput += idx * m_inputStride[i];

         index -= idx * m_outputStride[i];

       }

     } else {

       for (int i = 0; i < NumDims - 1; ++i) {

         const Index idx = index / m_outputStride[i];

         startInput += idx * m_inputStride[i];

         index -= idx * m_outputStride[i];

       }

     }

     startInput += index;

     return startInput;

   }


   EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const {

     for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {

       const Index input = firstIndex + j * m_indexStride[DimIndex];

       const Index kernel = firstKernel + j * m_kernelStride[DimIndex];

       if (DimIndex > 0) {

         convolve(input, kernel, DimIndex - 1, accum);

       } else {

         accum += m_inputImpl.coeff(input) * m_kernel[kernel];

       }

     }

   }


   template <typename Packet>

   EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const {

     for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {

       const Index input = firstIndex + j * m_indexStride[DimIndex];

       const Index kernel = firstKernel + j * m_kernelStride[DimIndex];

       if (DimIndex > 0) {

         convolvePacket(input, kernel, DimIndex - 1, accum);

       } else {

         accum = internal::pmadd<Packet>(m_inputImpl.template packet<Unaligned>(input),

                                         internal::pset1<Packet>(m_kernel[kernel]), accum);

       }

     }

   }


   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {

     // Don't make a local copy of the kernel unless we have to (i.e. it's an

     // expression that needs to be evaluated)

     const Scalar* in_place = m_kernelImpl.data();

     if (in_place) {

       m_kernel = in_place;

       m_local_kernel = false;

     } else {

       size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);

       Scalar* local = (Scalar*)m_device.allocate_temp(kernel_sz);

       typedef TensorEvalToOp<const KernelArgType> EvalTo;

       EvalTo evalToTmp(local, m_kernelArg);

       const bool Vectorize = internal::IsVectorizable<Device, KernelArgType>::value;

       internal::TensorExecutor<const EvalTo, Device, Vectorize>::run(evalToTmp, m_device);


       m_kernel = local;

       m_local_kernel = true;

     }

   }


   array<Index, NumDims> m_inputStride;

   array<Index, NumDims> m_outputStride;


   array<Index, NumKernelDims> m_indexStride;

   array<Index, NumKernelDims> m_kernelStride;

   TensorEvaluator<InputArgType, Device> m_inputImpl;

   TensorEvaluator<KernelArgType, Device> m_kernelImpl;

   Dimensions m_dimensions;


   KernelArgType m_kernelArg;

   const Scalar* m_kernel;

   bool m_local_kernel;

   const Device EIGEN_DEVICE_REF m_device;

 };


 // Use an optimized implementation of the evaluation code for GPUs whenever possible.

 #if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)


 template <int StaticKernelSize>

 struct GetKernelSize {

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(const int /*kernelSize*/) const { return StaticKernelSize; }

 };

 template <>

 struct GetKernelSize<Dynamic> {

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(const int kernelSize) const { return kernelSize; }

 };


 template <typename InputEvaluator, typename Index, typename InputDims, int StaticKernelSize>

 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel1D(

     InputEvaluator eval, const internal::IndexMapper<Index, InputDims, 1, InputEvaluator::Layout> indexMapper,

     const float* __restrict kernel, const int numPlanes, const int numX, const int maxX, const int kernelSize,

     float* buffer) {

 #if defined(EIGEN_HIPCC)

   HIP_DYNAMIC_SHARED(float, s)

 #else

   extern __shared__ float s[];

 #endif


   const int first_x = blockIdx.x * maxX;

   const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;

   const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSize>()(kernelSize);

   const int num_x_output = last_x - first_x + 1;


   const int first_plane = blockIdx.y * blockDim.y;

   const int plane_stride = blockDim.y * gridDim.y;


   for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) {

     // Load inputs to shared memory

     const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);

     const int plane_kernel_offset = threadIdx.y * num_x_input;

 #pragma unroll

     for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {

       const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + first_x);

       s[i + plane_kernel_offset] = eval.coeff(tensor_index);

     }


     __syncthreads();


     // Compute the convolution

     const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);


 #pragma unroll

     for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {

       const int kernel_offset = plane_kernel_offset + i;

       float result = 0.0f;

 #pragma unroll

       for (int k = 0; k < GetKernelSize<StaticKernelSize>()(kernelSize); ++k) {

         result += s[k + kernel_offset] * kernel[k];

       }

       const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i + first_x);

       buffer[tensor_index] = result;

     }

     __syncthreads();

   }

 };


 template <typename InputEvaluator, typename Index, typename InputDims, int StaticKernelSizeX, int StaticKernelSizeY>

 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel2D(

     InputEvaluator eval, const internal::IndexMapper<Index, InputDims, 2, InputEvaluator::Layout> indexMapper,

     const float* __restrict kernel, const int numPlanes, const int numX, const int maxX, const int numY, const int maxY,

     const int kernelSizeX, const int kernelSizeY, float* buffer) {

 #if defined(EIGEN_HIPCC)

   HIP_DYNAMIC_SHARED(float, s)

 #else

   extern __shared__ float s[];

 #endif


   const int first_x = blockIdx.x * maxX;

   const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;

   const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSizeX>()(kernelSizeX);

   const int num_x_output = last_x - first_x + 1;


   const int first_y = blockIdx.y * maxY;

   const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;

   const int num_y_input = last_y - first_y + GetKernelSize<StaticKernelSizeY>()(kernelSizeY);

   const int num_y_output = last_y - first_y + 1;


   const int first_plane = blockIdx.z * blockDim.z;

   const int plane_stride = blockDim.z * gridDim.z;


   for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) {

     const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);

     const int plane_kernel_offset = threadIdx.z * num_y_input;


 // Load inputs to shared memory

 #pragma unroll

     for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {

       const int input_offset = num_x_input * (j + plane_kernel_offset);

 #pragma unroll

       for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {

         const int tensor_index =

             plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + first_x, j + first_y);

         s[i + input_offset] = eval.coeff(tensor_index);

       }

     }


     __syncthreads();


     // Convolution

     const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);


 #pragma unroll

     for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {

 #pragma unroll

       for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {

         float result = 0.0f;

 #pragma unroll

         for (int l = 0; l < GetKernelSize<StaticKernelSizeY>()(kernelSizeY); ++l) {

           const int kernel_offset = kernelSizeX * l;

           const int input_offset = i + num_x_input * (j + l + plane_kernel_offset);

 #pragma unroll

           for (int k = 0; k < GetKernelSize<StaticKernelSizeX>()(kernelSizeX); ++k) {

             result += s[k + input_offset] * kernel[k + kernel_offset];

           }

         }

         const int tensor_index =

             plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i + first_x, j + first_y);

         buffer[tensor_index] = result;

       }

     }


     __syncthreads();

   }

 };


 template <typename InputEvaluator, typename Index, typename InputDims>

 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel3D(

     InputEvaluator eval, const internal::IndexMapper<Index, InputDims, 3, InputEvaluator::Layout> indexMapper,

     const float* __restrict kernel, const size_t numPlanes, const size_t numX, const size_t maxX, const size_t numY,

     const size_t maxY, const size_t numZ, const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY,

     const size_t kernelSizeZ, float* buffer) {

 #if defined(EIGEN_HIPCC)

   HIP_DYNAMIC_SHARED(float, s)

 #else

   extern __shared__ float s[];

 #endif


   // Load inputs to shared memory

   const int first_x = blockIdx.x * maxX;

   const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;

   const int num_x_input = last_x - first_x + kernelSizeX;


   const int first_y = blockIdx.y * maxY;

   const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;

   const int num_y_input = last_y - first_y + kernelSizeY;


   const int first_z = blockIdx.z * maxZ;

   const int last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1;

   const int num_z_input = last_z - first_z + kernelSizeZ;


   for (int p = 0; p < numPlanes; ++p) {

     const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);

     const int plane_kernel_offset = 0;


     for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) {

       for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {

         for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {

           const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(

                                                             i + first_x, j + first_y, k + first_z);

           s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index);

         }

       }

     }


     __syncthreads();


     // Convolution

     const int num_z_output = last_z - first_z + 1;

     const int num_y_output = last_y - first_y + 1;

     const int num_x_output = last_x - first_x + 1;

     const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);


     for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) {

       for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {

         for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {

           float result = 0.0f;

           for (int n = 0; n < kernelSizeZ; ++n) {

             for (int m = 0; m < kernelSizeY; ++m) {

               for (int l = 0; l < kernelSizeX; ++l) {

                 result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] *

                           kernel[l + kernelSizeX * (m + kernelSizeY * n)];

               }

             }

           }

           const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(

                                                              i + first_x, j + first_y, k + first_z);

           buffer[tensor_index] = result;

         }

       }

     }

     __syncthreads();

   }

 };


 template <typename Indices, typename InputArgType, typename KernelArgType>

 struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, GpuDevice> {

   typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;


   static constexpr int NumDims =

       internal::array_size<typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions>::value;

   static constexpr int NumKernelDims = internal::array_size<Indices>::value;

   typedef typename XprType::Index Index;

   typedef DSizes<Index, NumDims> Dimensions;

   typedef typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions KernelDimensions;


   static constexpr int Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout;

   enum {

     IsAligned =

         TensorEvaluator<InputArgType, GpuDevice>::IsAligned & TensorEvaluator<KernelArgType, GpuDevice>::IsAligned,

     PacketAccess = false,

     BlockAccess = false,

     PreferBlockAccess = false,

     CoordAccess = false,  // to be implemented

     RawAccess = false

   };


   //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//

   typedef internal::TensorBlockNotImplemented TensorBlock;

   //===--------------------------------------------------------------------===//


   TensorEvaluator(const XprType& op, const GpuDevice& device)

       : m_inputImpl(op.inputExpression(), device),

         m_kernelImpl(op.kernelExpression(), device),

         m_kernelArg(op.kernelExpression()),

         m_indices(op.indices()),

         m_buf(NULL),

         m_kernel(NULL),

         m_local_kernel(false),

         m_device(device) {

     EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, GpuDevice>::Layout) ==

                          static_cast<int>(TensorEvaluator<KernelArgType, GpuDevice>::Layout)),

                         YOU_MADE_A_PROGRAMMING_MISTAKE);


     const typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions& input_dims = m_inputImpl.dimensions();

     const typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions();


     m_dimensions = m_inputImpl.dimensions();

     for (int i = 0; i < NumKernelDims; ++i) {

       const Index index = op.indices()[i];

       const Index input_dim = input_dims[index];

       const Index kernel_dim = kernel_dims[i];

       const Index result_dim = input_dim - kernel_dim + 1;

       m_dimensions[index] = result_dim;

     }

   }


   typedef typename XprType::CoeffReturnType CoeffReturnType;

   typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType;

   typedef typename InputArgType::Scalar Scalar;

   static constexpr int PacketSize = internal::unpacket_traits<PacketReturnType>::size;


   EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; }


   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {

     preloadKernel();

     m_inputImpl.evalSubExprsIfNeeded(NULL);

     if (data) {

       executeEval(data);

       return false;

     } else {

       m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar));

       executeEval(m_buf);

       return true;

     }

   }


   EIGEN_STRONG_INLINE void cleanup() {

     m_inputImpl.cleanup();

     if (m_buf) {

       m_device.deallocate(m_buf);

       m_buf = NULL;

     }

     if (m_local_kernel) {

       m_device.deallocate((void*)m_kernel);

       m_local_kernel = false;

     }

     m_kernel = NULL;

   }


   EIGEN_STRONG_INLINE void preloadKernel() {

     // Don't make a local copy of the kernel unless we have to (i.e. it's an

     // expression that needs to be evaluated)

     const Scalar* in_place = m_kernelImpl.data();

     if (in_place) {

       m_kernel = in_place;

       m_local_kernel = false;

     } else {

       size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);

       Scalar* local = (Scalar*)m_device.allocate(kernel_sz);

       typedef TensorEvalToOp<const KernelArgType> EvalTo;

       EvalTo evalToTmp(local, m_kernelArg);

       const bool PacketAccess = internal::IsVectorizable<GpuDevice, KernelArgType>::value;

       internal::TensorExecutor<const EvalTo, GpuDevice, PacketAccess>::run(evalToTmp, m_device);


       m_kernel = local;

       m_local_kernel = true;

     }

   }


   static unsigned int ceil(unsigned int num, unsigned int denom) {

     const unsigned int rounded_toward_zero = num / denom;

     if (num > rounded_toward_zero * denom) {

       return rounded_toward_zero + 1;

     }

     return rounded_toward_zero;

   }


   void executeEval(Scalar* data) const {

     typedef typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions InputDims;


     const int maxSharedMem = m_device.sharedMemPerBlock();

     const int maxThreadsPerBlock = m_device.maxGpuThreadsPerBlock();

     const int maxBlocksPerProcessor = m_device.maxGpuThreadsPerMultiProcessor() / maxThreadsPerBlock;

     const int numMultiProcessors = m_device.getNumGpuMultiProcessors();

     const int warpSize = 32;


     switch (NumKernelDims) {

       case 1: {

         const int kernel_size = m_kernelImpl.dimensions().TotalSize();


         const int numX = dimensions()[m_indices[0]];

         const int numP = dimensions().TotalSize() / numX;

         int maxX;

         dim3 block_size;


         const int single_stride_dim =

             static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : m_inputImpl.dimensions().rank() - 1;

         if (m_indices[0] == single_stride_dim) {

           // Maximum the reuse

           const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32;

           maxX = numext::mini<int>(inner_dim, numX);

           const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP);

           block_size.x = numext::mini(maxThreadsPerBlock, maxX);

           block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP);

         } else {

           // Read as much as possible alongside the inner most dimension, that is the plane

           const int inner_dim = maxSharedMem / ((warpSize + kernel_size) * sizeof(Scalar));

           const int maxP = numext::mini<int>(inner_dim, numP);

           maxX = numext::mini<int>(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX);


           block_size.x = numext::mini(warpSize, maxX);

           block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP);

         }


         const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar);

         gpu_assert(shared_mem <= maxSharedMem);


         const int num_x_blocks = ceil(numX, maxX);

         const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);

         const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks);


         dim3 num_blocks(num_x_blocks, numext::mini<int>(num_y_blocks, ceil(numP, block_size.y)));


         // cout << "launching 1D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << "

         // num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " maxX: " << maxX << " shared_mem: "

         // << shared_mem << " in stream " << m_device.stream() << endl;


         const array<Index, 1> indices{m_indices[0]};

         const array<Index, 1> kernel_dims{m_kernelImpl.dimensions()[0]};

         internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);

         switch (kernel_size) {

           case 4: {

             LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>),

                               num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP,

                               numX, maxX, 4, data);

             break;

           }

           case 7: {

             LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>),

                               num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP,

                               numX, maxX, 7, data);

             break;

           }

           default: {

             LAUNCH_GPU_KERNEL(

                 (EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>),

                 num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX,

                 kernel_size, data);

           }

         }

         break;

       }


       case 2: {

         const int idxX = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1;

         const int idxY = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0;

         const int kernel_size_x = m_kernelImpl.dimensions()[idxX];

         const int kernel_size_y = m_kernelImpl.dimensions()[idxY];


         const int numX = dimensions()[m_indices[idxX]];

         const int numY = dimensions()[m_indices[idxY]];

         const int numP = dimensions().TotalSize() / (numX * numY);


         const float scaling_factor =

             sqrtf(static_cast<float>(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x));


         // Snap maxX to warp size

         int inner_dim = ((static_cast<int>(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32;

         const int maxX = numext::mini<int>(inner_dim, numX);

         const int maxY =

             numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY);

         const int maxP = numext::mini<int>(

             maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP);


         dim3 block_size;

         block_size.x = numext::mini(1024, maxX);

         block_size.y = numext::mini<int>(1024 / block_size.x, maxY);

         block_size.z = numext::mini<int>(1024 / (block_size.x * block_size.y), maxP);


         const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar);

         gpu_assert(shared_mem <= maxSharedMem);


         const int num_x_blocks = ceil(numX, maxX);

         const int num_y_blocks = ceil(numY, maxY);

         const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);

         const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks);


         dim3 num_blocks(num_x_blocks, num_y_blocks, numext::mini<int>(num_z_blocks, ceil(numP, block_size.z)));


         // cout << "launching 2D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y  << "

         // block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y <<

         // " num_blocks.z: " << num_blocks.z << " maxX: " << maxX << " maxY: " << maxY << " maxP: " << maxP << "

         // shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;


         const array<Index, 2> indices{m_indices[idxX], m_indices[idxY]};

         const array<Index, 2> kernel_dims{m_kernelImpl.dimensions()[idxX], m_kernelImpl.dimensions()[idxY]};

         internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);

         switch (kernel_size_x) {

           case 4: {

             switch (kernel_size_y) {

               case 7: {

                 LAUNCH_GPU_KERNEL(

                     (EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>),

                     num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX,

                     numY, maxY, 4, 7, data);

                 break;

               }

               default: {

                 LAUNCH_GPU_KERNEL(

                     (EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>),

                     num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX,

                     numY, maxY, 4, kernel_size_y, data);

                 break;

               }

             }

             break;

           }

           case 7: {

             switch (kernel_size_y) {

               case 4: {

                 LAUNCH_GPU_KERNEL(

                     (EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>),

                     num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX,

                     numY, maxY, 7, 4, data);

                 break;

               }

               default: {

                 LAUNCH_GPU_KERNEL(

                     (EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>),

                     num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX,

                     numY, maxY, 7, kernel_size_y, data);

                 break;

               }

             }

             break;

           }

           default: {

             LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims,

                                                         Dynamic, Dynamic>),

                               num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP,

                               numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data);

             break;

           }

         }

         break;

       }


       case 3: {

         const int idxX = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2;

         const int idxY = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1;

         const int idxZ = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0;


         const int kernel_size_x = m_kernelImpl.dimensions()[idxX];

         const int kernel_size_y = m_kernelImpl.dimensions()[idxY];

         const int kernel_size_z = m_kernelImpl.dimensions()[idxZ];


         const int numX = dimensions()[m_indices[idxX]];

         const int numY = dimensions()[m_indices[idxY]];

         const int numZ = dimensions()[m_indices[idxZ]];

         const int numP = dimensions().TotalSize() / (numX * numY * numZ);


         const int maxX = numext::mini<int>(

             128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1,

                                    numX));

         const int maxY = numext::mini<int>(

             128, numext::mini<int>(

                      maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1,

                      numY));

         const int maxZ = numext::mini<int>(

             128, numext::mini<int>(

                      maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) -

                          kernel_size_z + 1,

                      numZ));


         dim3 block_size;

         block_size.x = numext::mini(32, maxX);

         block_size.y = numext::mini(32, maxY);

         block_size.z = numext::mini<int>(1024 / (block_size.x * block_size.y), maxZ);

         dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ));


         const int shared_mem =

             (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar);

         gpu_assert(shared_mem <= maxSharedMem);


         // cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y  << "

         // block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y <<

         // " num_blocks.z: " << num_blocks.z  << " shared_mem: " << shared_mem << " in stream " << m_device.stream() <<

         // endl;

         const array<Index, 3> indices{m_indices[idxX], m_indices[idxY], m_indices[idxZ]};

         const array<Index, 3> kernel_dims{m_kernelImpl.dimensions()[idxX], m_kernelImpl.dimensions()[idxY],

                                           m_kernelImpl.dimensions()[idxZ]};

         internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);


         LAUNCH_GPU_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>),

                           num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX,

                           maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data);

         break;

       }


       default: {

         EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3),

                             THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);

       }

     }

   }


   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {

     eigen_assert(m_buf);

     eigen_assert(index < m_dimensions.TotalSize());

     return m_buf[index];

   }


   template <int LoadMode>

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const {

     eigen_assert(m_buf);

     eigen_assert(index < m_dimensions.TotalSize());

     return internal::ploadt<PacketReturnType, LoadMode>(m_buf + index);

   }


   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {

     // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost

     // model.

     const double kernel_size = m_kernelImpl.dimensions().TotalSize();

     // We ignore the use of fused multiply-add.

     const double convolve_compute_cost = TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();

     const double firstIndex_compute_cost =

         NumDims *

         (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>());

     return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +

            kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) +

                           TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize));

   }


  private:

   TensorEvaluator<InputArgType, GpuDevice> m_inputImpl;

   TensorEvaluator<KernelArgType, GpuDevice> m_kernelImpl;

   KernelArgType m_kernelArg;

   Indices m_indices;

   Dimensions m_dimensions;

   Scalar* m_buf;

   const Scalar* m_kernel;

   bool m_local_kernel;


   const GpuDevice& m_device;

 };

 #endif


 }  // end namespace Eigen


 #endif  // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H

i
int i
Definition: BiCGSTAB_step_by_step.cpp:9

n
const unsigned n
Definition: CG3DPackingUnitTest.cpp:11

EIGEN_ALIGN_MAX
#define EIGEN_ALIGN_MAX
Definition: ConfigureVectorization.h:146

EIGEN_DEVICE_FUNC
#define EIGEN_DEVICE_FUNC
Definition: Macros.h:892

EIGEN_HIP_LAUNCH_BOUNDS_1024
#define EIGEN_HIP_LAUNCH_BOUNDS_1024
Definition: Macros.h:576

eigen_assert
#define eigen_assert(x)
Definition: Macros.h:910

EIGEN_STRONG_INLINE
#define EIGEN_STRONG_INLINE
Definition: Macros.h:834

EIGEN_STATIC_ASSERT
#define EIGEN_STATIC_ASSERT(X, MSG)
Definition: StaticAssert.h:26

EIGEN_DEVICE_REF
#define EIGEN_DEVICE_REF
Definition: TensorMacros.h:34

p
float * p
Definition: Tutorial_Map_using.cpp:9

Scalar
SCALAR Scalar
Definition: bench_gemm.cpp:45

Eigen::TensorBase
The tensor base class.
Definition: TensorBase.h:1026

Eigen::TensorConvolutionOp
Definition: TensorConvolution.h:236

Eigen::TensorConvolutionOp::inputExpression
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const internal::remove_all_t< typename InputXprType::Nested > & inputExpression() const
Definition: TensorConvolution.h:253

Eigen::TensorConvolutionOp::Index
Eigen::internal::traits< TensorConvolutionOp >::Index Index
Definition: TensorConvolution.h:244

Eigen::TensorConvolutionOp::Scalar
Eigen::internal::traits< TensorConvolutionOp >::Scalar Scalar
Definition: TensorConvolution.h:238

Eigen::TensorConvolutionOp::kernelExpression
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const internal::remove_all_t< typename KernelXprType::Nested > & kernelExpression() const
Definition: TensorConvolution.h:258

Eigen::TensorConvolutionOp::StorageKind
Eigen::internal::traits< TensorConvolutionOp >::StorageKind StorageKind
Definition: TensorConvolution.h:243

Eigen::TensorConvolutionOp::CoeffReturnType
internal::promote_storage_type< typename InputXprType::CoeffReturnType, typename KernelXprType::CoeffReturnType >::ret CoeffReturnType
Definition: TensorConvolution.h:241

Eigen::TensorConvolutionOp::TensorConvolutionOp
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionOp(const InputXprType &input, const KernelXprType &kernel, const Indices &dims)
Definition: TensorConvolution.h:246

Eigen::TensorConvolutionOp::m_indices
const Indices m_indices
Definition: TensorConvolution.h:266

Eigen::TensorConvolutionOp::Nested
Eigen::internal::nested< TensorConvolutionOp >::type Nested
Definition: TensorConvolution.h:242

Eigen::TensorConvolutionOp::RealScalar
Eigen::NumTraits< Scalar >::Real RealScalar
Definition: TensorConvolution.h:239

Eigen::TensorConvolutionOp::indices
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Indices & indices() const
Definition: TensorConvolution.h:250

Eigen::TensorConvolutionOp::m_input_xpr
InputXprType::Nested m_input_xpr
Definition: TensorConvolution.h:264

Eigen::TensorConvolutionOp::m_kernel_xpr
KernelXprType::Nested m_kernel_xpr
Definition: TensorConvolution.h:265

Eigen::TensorEvalToOp
Definition: TensorEvalTo.h:61

Eigen::TensorOpCost
Definition: TensorCostModel.h:28

Eigen::Triplet< double >

Eigen::internal::IndexMapper
Definition: TensorConvolution.h:28

Eigen::internal::IndexMapper::m_inputStrides
array< Index, NumDims > m_inputStrides
Definition: TensorConvolution.h:194

Eigen::internal::IndexMapper::mapGpuOutputPlaneToTensorOutputOffset
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputPlaneToTensorOutputOffset(Index p) const
Definition: TensorConvolution.h:136

Eigen::internal::IndexMapper::m_gpuOutputStrides
array< Index, NumDims > m_gpuOutputStrides
Definition: TensorConvolution.h:197

Eigen::internal::IndexMapper::m_outputStrides
array< Index, NumDims > m_outputStrides
Definition: TensorConvolution.h:195

Eigen::internal::IndexMapper::mapGpuInputKernelToTensorInputOffset
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j, Index k) const
Definition: TensorConvolution.h:182

Eigen::internal::IndexMapper::mapGpuInputKernelToTensorInputOffset
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i) const
Definition: TensorConvolution.h:162

Eigen::internal::IndexMapper::mapGpuOutputKernelToTensorOutputOffset
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const
Definition: TensorConvolution.h:187

Eigen::internal::IndexMapper::IndexMapper
IndexMapper(const InputDims &input_dims, const array< Index, NumKernelDims > &kernel_dims, const array< Index, NumKernelDims > &indices)
Definition: TensorConvolution.h:30

Eigen::internal::IndexMapper::NumDims
static constexpr int NumDims
Definition: TensorConvolution.h:193

Eigen::internal::IndexMapper::mapGpuOutputKernelToTensorOutputOffset
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j) const
Definition: TensorConvolution.h:177

Eigen::internal::IndexMapper::m_gpuInputStrides
array< Index, NumDims > m_gpuInputStrides
Definition: TensorConvolution.h:196

Eigen::internal::IndexMapper::mapGpuInputPlaneToTensorInputOffset
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputPlaneToTensorInputOffset(Index p) const
Definition: TensorConvolution.h:110

Eigen::internal::IndexMapper::mapGpuInputKernelToTensorInputOffset
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j) const
Definition: TensorConvolution.h:172

Eigen::internal::IndexMapper::mapGpuOutputKernelToTensorOutputOffset
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i) const
Definition: TensorConvolution.h:167

Eigen::internal::TensorBlockNotImplemented
Definition: TensorBlock.h:566

Eigen::internal::TensorExecutor::run
static EIGEN_STRONG_INLINE void run(const Expression &expr, const Device &device=DefaultDevice())
Definition: TensorExecutor.h:92

threadIdx
dim3 threadIdx
Definition: gpu_common.h:16

blockDim
dim3 blockDim
Definition: gpu_common.h:16

blockIdx
dim3 blockIdx
Definition: gpu_common.h:16

Eigen::ColMajor
@ ColMajor
Definition: Constants.h:318

s
RealScalar s
Definition: level1_cplx_impl.h:130

int
return int(ret)+1

ret
Eigen::DenseIndex ret
Definition: level1_cplx_impl.h:43

m
int * m
Definition: level2_cplx_impl.h:294

k
char char char int int * k
Definition: level2_impl.h:374

op
char char * op
Definition: level2_impl.h:374

tmp
Eigen::Matrix< Scalar, Dynamic, Dynamic, ColMajor > tmp
Definition: level3_impl.h:365

Eigen::TensorSycl::internal::contraction_type::local
@ local

Eigen::bfloat16_impl::ceil
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 ceil(const bfloat16 &a)
Definition: BFloat16.h:644

Eigen::internal::remove_all_t
typename remove_all< T >::type remove_all_t
Definition: Meta.h:142

Eigen::numext::mini
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T mini(const T &x, const T &y)
Definition: MathFunctions.h:920

Eigen
Namespace containing all symbols from the Eigen library.
Definition: bench_norm.cpp:70

Eigen::array
std::array< T, N > array
Definition: EmulateArray.h:231

Eigen::value
squared absolute value
Definition: GlobalFunctions.h:87

Eigen::Index
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition: Meta.h:83

Eigen::Dynamic
const int Dynamic
Definition: Constants.h:25

Flags
Extend namespace for flags.
Definition: fsi_chan_precond_driver.cc:56

calibrate.val
val
Definition: calibrate.py:119

compute_granudrum_aor.type
type
Definition: compute_granudrum_aor.py:141

fix_broken_doxygen_formulae.in_place
in_place
Definition: fix_broken_doxygen_formulae.py:249

internal
Definition: Eigen_Colamd.h:49

XprType
CwiseBinaryOp< internal::scalar_sum_op< double, double >, const CpyMatrixXd, const CpyMatrixXd > XprType
Definition: nestbyvalue.cpp:15

eval
internal::nested_eval< T, 1 >::type eval(const T &xpr)
Definition: sparse_permutations.cpp:47

Eigen::DSizes< Index, NumDims >

Eigen::Dense
Definition: Constants.h:519

Eigen::GenericNumTraits::Real
T Real
Definition: NumTraits.h:183

Eigen::PacketType
Definition: TensorMeta.h:47

Eigen::PacketType::type
internal::packet_traits< Scalar >::type type
Definition: TensorMeta.h:48

Eigen::StorageMemory
Definition: TensorForwardDeclarations.h:42

Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::Dimensions
DSizes< Index, NumDims > Dimensions
Definition: TensorConvolution.h:277

Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::m_dimensions
Dimensions m_dimensions
Definition: TensorConvolution.h:531

Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::coeff
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
Definition: TensorConvolution.h:394

Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::m_inputImpl
TensorEvaluator< InputArgType, Device > m_inputImpl
Definition: TensorConvolution.h:529

Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::convolve
EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType &accum) const
Definition: TensorConvolution.h:478

Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::TensorEvaluator
EIGEN_STRONG_INLINE TensorEvaluator(const XprType &op, const Device &device)
Definition: TensorConvolution.h:302

Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::packet
EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const
Definition: TensorConvolution.h:401

Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::PacketReturnType
PacketType< CoeffReturnType, Device >::type PacketReturnType
Definition: TensorConvolution.h:281

Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::convolvePacket
EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet &accum) const
Definition: TensorConvolution.h:491

Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::m_inputStride
array< Index, NumDims > m_inputStride
Definition: TensorConvolution.h:524

Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::CoeffReturnType
XprType::CoeffReturnType CoeffReturnType
Definition: TensorConvolution.h:280

Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::m_kernelArg
KernelArgType m_kernelArg
Definition: TensorConvolution.h:533

Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::m_kernelStride
array< Index, NumKernelDims > m_kernelStride
Definition: TensorConvolution.h:528

Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::cleanup
EIGEN_STRONG_INLINE void cleanup()
Definition: TensorConvolution.h:377

Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::Index
XprType::Index Index
Definition: TensorConvolution.h:276

Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::TensorBlock
internal::TensorBlockNotImplemented TensorBlock
Definition: TensorConvolution.h:299

Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::costPerCoeff
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const
Definition: TensorConvolution.h:444

Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::firstInput
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const
Definition: TensorConvolution.h:459

Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::preloadKernel
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel()
Definition: TensorConvolution.h:504

Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::Scalar
XprType::Scalar Scalar
Definition: TensorConvolution.h:279

Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::data
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const
Definition: TensorConvolution.h:456

Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::m_kernel
const Scalar * m_kernel
Definition: TensorConvolution.h:534

Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::dimensions
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions & dimensions() const
Definition: TensorConvolution.h:370

Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::m_kernelImpl
TensorEvaluator< KernelArgType, Device > m_kernelImpl
Definition: TensorConvolution.h:530

Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::Storage
StorageMemory< Scalar, Device > Storage
Definition: TensorConvolution.h:283

Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::m_outputStride
array< Index, NumDims > m_outputStride
Definition: TensorConvolution.h:525

Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::m_local_kernel
bool m_local_kernel
Definition: TensorConvolution.h:535

Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::EvaluatorPointerType
Storage::Type EvaluatorPointerType
Definition: TensorConvolution.h:284

Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::m_indexStride
array< Index, NumKernelDims > m_indexStride
Definition: TensorConvolution.h:527

Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::XprType
TensorConvolutionOp< Indices, InputArgType, KernelArgType > XprType
Definition: TensorConvolution.h:271

Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::evalTo
void evalTo(typename XprType::Scalar *buffer)
Definition: TensorConvolution.h:386

Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::m_device
const Device EIGEN_DEVICE_REF m_device
Definition: TensorConvolution.h:536

Eigen::TensorEvaluator< const TensorConvolutionOp< Indices, InputArgType, KernelArgType >, Device >::evalSubExprsIfNeeded
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar *)
Definition: TensorConvolution.h:372

Eigen::TensorEvaluator
A cost model used to limit the number of threads used for evaluating tensor expression.
Definition: TensorEvaluator.h:31

Eigen::TensorEvaluator::Layout
static constexpr int Layout
Definition: TensorEvaluator.h:46

Eigen::TensorEvaluator::evalSubExprsIfNeeded
EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType dest)
Definition: TensorEvaluator.h:71

Eigen::TensorEvaluator::Scalar
Derived::Scalar Scalar
Definition: TensorEvaluator.h:33

Eigen::TensorEvaluator::m_device
const Device EIGEN_DEVICE_REF m_device
Definition: TensorEvaluator.h:170

Eigen::TensorEvaluator::TensorEvaluator
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived &m, const Device &device)
Definition: TensorEvaluator.h:66

Eigen::TensorEvaluator::PacketAccess
@ PacketAccess
Definition: TensorEvaluator.h:50

Eigen::TensorEvaluator::IsAligned
@ IsAligned
Definition: TensorEvaluator.h:49

Eigen::TensorEvaluator::PacketSize
static constexpr int PacketSize
Definition: TensorEvaluator.h:38

Eigen::TensorEvaluator::data
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const
Definition: TensorEvaluator.h:165

Eigen::TensorEvaluator::CoeffReturnType
Derived::Scalar CoeffReturnType
Definition: TensorEvaluator.h:34

Eigen::TensorEvaluator::coeff
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
Definition: TensorEvaluator.h:89

Eigen::TensorEvaluator::cleanup
EIGEN_STRONG_INLINE void cleanup()
Definition: TensorEvaluator.h:87

Eigen::TensorEvaluator::XprType
Derived XprType
Definition: TensorEvaluator.h:37

Eigen::TensorEvaluator::Index
Derived::Index Index
Definition: TensorEvaluator.h:32

Eigen::TensorEvaluator::TensorBlock
internal::TensorMaterializedBlock< ScalarNoConst, NumCoords, Layout, Index > TensorBlock
Definition: TensorEvaluator.h:63

Eigen::TensorEvaluator::PacketReturnType
PacketType< CoeffReturnType, Device >::type PacketReturnType
Definition: TensorEvaluator.h:35

Eigen::TensorEvaluator::Dimensions
Derived::Dimensions Dimensions
Definition: TensorEvaluator.h:36

Eigen::TensorEvaluator::costPerCoeff
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const
Definition: TensorEvaluator.h:139

Eigen::TensorEvaluator::dimensions
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions & dimensions() const
Definition: TensorEvaluator.h:69

Eigen::TensorEvaluator::packet
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
Definition: TensorEvaluator.h:100

Eigen::internal::IsVectorizable
Definition: TensorForwardDeclarations.h:175

Eigen::internal::IsVectorizable::value
static const bool value
Definition: TensorForwardDeclarations.h:176

Eigen::internal::array_size
Definition: Meta.h:305

Eigen::internal::array_size::value
static constexpr Index value
Definition: Meta.h:306

Eigen::internal::eval< TensorConvolutionOp< Dimensions, InputXprType, KernelXprType >, Eigen::Dense >::type
const TensorConvolutionOp< Dimensions, InputXprType, KernelXprType > & type
Definition: TensorConvolution.h:223

Eigen::internal::eval
Definition: XprHelper.h:427

Eigen::internal::nested< TensorConvolutionOp< Dimensions, InputXprType, KernelXprType >, 1, typename eval< TensorConvolutionOp< Dimensions, InputXprType, KernelXprType > >::type >::type
TensorConvolutionOp< Dimensions, InputXprType, KernelXprType > type
Definition: TensorConvolution.h:229

Eigen::internal::nested
Definition: TensorTraits.h:152

Eigen::internal::nested::type
ref_selector< T >::type type
Definition: TensorTraits.h:153

Eigen::internal::promote_index_type
Definition: XprHelper.h:145

Eigen::internal::promote_storage_type
Definition: XprHelper.h:591

Eigen::internal::traits< TensorConvolutionOp< Dimensions, InputXprType, KernelXprType > >::StorageKind
promote_storage_type< typename traits< InputXprType >::StorageKind, typename traits< KernelXprType >::StorageKind >::ret StorageKind
Definition: TensorConvolution.h:205

Eigen::internal::traits< TensorConvolutionOp< Dimensions, InputXprType, KernelXprType > >::LhsNested
InputXprType::Nested LhsNested
Definition: TensorConvolution.h:208

Eigen::internal::traits< TensorConvolutionOp< Dimensions, InputXprType, KernelXprType > >::RhsNested
KernelXprType::Nested RhsNested
Definition: TensorConvolution.h:209

Eigen::internal::traits< TensorConvolutionOp< Dimensions, InputXprType, KernelXprType > >::Index
promote_index_type< typename traits< InputXprType >::Index, typename traits< KernelXprType >::Index >::type Index
Definition: TensorConvolution.h:207

Eigen::internal::traits< TensorConvolutionOp< Dimensions, InputXprType, KernelXprType > >::PointerType
std::conditional_t< Pointer_type_promotion< typename InputXprType::Scalar, Scalar >::val, typename traits< InputXprType >::PointerType, typename traits< KernelXprType >::PointerType > PointerType
Definition: TensorConvolution.h:216

Eigen::internal::traits< TensorConvolutionOp< Dimensions, InputXprType, KernelXprType > >::Scalar
promote_storage_type< typename InputXprType::Scalar, typename KernelXprType::Scalar >::ret Scalar
Definition: TensorConvolution.h:203

Eigen::internal::traits< TensorConvolutionOp< Dimensions, InputXprType, KernelXprType > >::RhsNested_
std::remove_reference_t< RhsNested > RhsNested_
Definition: TensorConvolution.h:211

Eigen::internal::traits< TensorConvolutionOp< Dimensions, InputXprType, KernelXprType > >::LhsNested_
std::remove_reference_t< LhsNested > LhsNested_
Definition: TensorConvolution.h:210

Eigen::internal::traits
Definition: ForwardDeclarations.h:21

Eigen::internal::unpacket_traits::size
@ size
Definition: GenericPacketMath.h:139

j
std::ptrdiff_t j
Definition: tut_arithmetic_redux_minmax.cpp:2

InternalHeaderCheck.h