TensorConvolution.h
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 
10 #ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
11 #define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
12 
13 // IWYU pragma: private
14 #include "./InternalHeaderCheck.h"
15 
16 namespace Eigen {
17 
18 /** \class TensorConvolution
19  * \ingroup CXX11_Tensor_Module
20  *
21  * \brief Tensor convolution class.
22  *
23  *
24  */
25 namespace internal {
26 
27 template <typename Index, typename InputDims, int NumKernelDims, int Layout>
28 class IndexMapper {
29  public:
30  IndexMapper(const InputDims& input_dims, const array<Index, NumKernelDims>& kernel_dims,
31  const array<Index, NumKernelDims>& indices) {
32  array<Index, NumDims> dimensions = input_dims;
33  for (int i = 0; i < NumKernelDims; ++i) {
34  const Index index = indices[i];
35  const Index input_dim = input_dims[index];
36  const Index kernel_dim = kernel_dims[i];
37  const Index result_dim = input_dim - kernel_dim + 1;
38  dimensions[index] = result_dim;
39  }
40 
41  array<Index, NumDims> inputStrides;
42  array<Index, NumDims> outputStrides;
43  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
44  inputStrides[0] = 1;
45  outputStrides[0] = 1;
46  for (int i = 1; i < NumDims; ++i) {
47  inputStrides[i] = inputStrides[i - 1] * input_dims[i - 1];
48  outputStrides[i] = outputStrides[i - 1] * dimensions[i - 1];
49  }
50  } else {
51  inputStrides[NumDims - 1] = 1;
52  outputStrides[NumDims - 1] = 1;
53  for (int i = static_cast<int>(NumDims) - 2; i >= 0; --i) {
54  inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1];
55  outputStrides[i] = outputStrides[i + 1] * dimensions[i + 1];
56  }
57  }
58 
59  array<Index, NumDims> gpuInputDimensions;
60  array<Index, NumDims> gpuOutputDimensions;
61  array<Index, NumDims> tmp = dimensions;
62  array<Index, NumDims> ordering;
63  const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - NumKernelDims;
64  for (int i = 0; i < NumKernelDims; ++i) {
65  const Index index = i + offset;
66  ordering[index] = indices[i];
67  tmp[indices[i]] = -1;
68  gpuInputDimensions[index] = input_dims[indices[i]];
69  gpuOutputDimensions[index] = dimensions[indices[i]];
70  }
71 
72  int written = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? NumKernelDims : 0;
73  for (int i = 0; i < NumDims; ++i) {
74  if (tmp[i] >= 0) {
75  ordering[written] = i;
76  gpuInputDimensions[written] = input_dims[i];
77  gpuOutputDimensions[written] = dimensions[i];
78  ++written;
79  }
80  }
81 
82  for (int i = 0; i < NumDims; ++i) {
83  m_inputStrides[i] = inputStrides[ordering[i]];
84  m_outputStrides[i] = outputStrides[ordering[i]];
85  }
86 
87  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
88  for (int i = 0; i < NumDims; ++i) {
89  if (i > NumKernelDims) {
90  m_gpuInputStrides[i] = m_gpuInputStrides[i - 1] * gpuInputDimensions[i - 1];
91  m_gpuOutputStrides[i] = m_gpuOutputStrides[i - 1] * gpuOutputDimensions[i - 1];
92  } else {
93  m_gpuInputStrides[i] = 1;
94  m_gpuOutputStrides[i] = 1;
95  }
96  }
97  } else {
98  for (int i = NumDims - 1; i >= 0; --i) {
99  if (static_cast<size_t>(i + 1) < offset) {
100  m_gpuInputStrides[i] = m_gpuInputStrides[i + 1] * gpuInputDimensions[i + 1];
101  m_gpuOutputStrides[i] = m_gpuOutputStrides[i + 1] * gpuOutputDimensions[i + 1];
102  } else {
103  m_gpuInputStrides[i] = 1;
104  m_gpuOutputStrides[i] = 1;
105  }
106  }
107  }
108  }
109 
110  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputPlaneToTensorInputOffset(Index p) const {
111  Index inputIndex = 0;
112  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
113  for (int d = NumDims - 1; d > NumKernelDims; --d) {
114  const Index idx = p / m_gpuInputStrides[d];
115  inputIndex += idx * m_inputStrides[d];
116  p -= idx * m_gpuInputStrides[d];
117  }
118  if (NumKernelDims < NumDims) {
119  inputIndex += p * m_inputStrides[NumKernelDims];
120  }
121  } else {
122  std::ptrdiff_t limit = 0;
123  if (NumKernelDims < NumDims) {
124  limit = NumDims - NumKernelDims - 1;
125  }
126  for (int d = 0; d < limit; ++d) {
127  const Index idx = p / m_gpuInputStrides[d];
128  inputIndex += idx * m_inputStrides[d];
129  p -= idx * m_gpuInputStrides[d];
130  }
131  inputIndex += p * m_inputStrides[limit];
132  }
133  return inputIndex;
134  }
135 
136  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputPlaneToTensorOutputOffset(Index p) const {
137  Index outputIndex = 0;
138  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
139  for (int d = NumDims - 1; d > NumKernelDims; --d) {
140  const Index idx = p / m_gpuOutputStrides[d];
141  outputIndex += idx * m_outputStrides[d];
142  p -= idx * m_gpuOutputStrides[d];
143  }
144  if (NumKernelDims < NumDims) {
145  outputIndex += p * m_outputStrides[NumKernelDims];
146  }
147  } else {
148  std::ptrdiff_t limit = 0;
149  if (NumKernelDims < NumDims) {
150  limit = NumDims - NumKernelDims - 1;
151  }
152  for (int d = 0; d < limit; ++d) {
153  const Index idx = p / m_gpuOutputStrides[d];
154  outputIndex += idx * m_outputStrides[d];
155  p -= idx * m_gpuOutputStrides[d];
156  }
157  outputIndex += p * m_outputStrides[limit];
158  }
159  return outputIndex;
160  }
161 
162  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i) const {
163  const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - NumKernelDims;
164  return i * m_inputStrides[offset];
165  }
166 
167  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i) const {
168  const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - NumKernelDims;
169  return i * m_outputStrides[offset];
170  }
171 
172  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j) const {
173  const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - NumKernelDims;
174  return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1];
175  }
176 
177  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j) const {
178  const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - NumKernelDims;
179  return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1];
180  }
181 
182  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j, Index k) const {
183  const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - NumKernelDims;
184  return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1] + k * m_inputStrides[offset + 2];
185  }
186 
187  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const {
188  const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - NumKernelDims;
189  return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1] + k * m_outputStrides[offset + 2];
190  }
191 
192  private:
193  static constexpr int NumDims = internal::array_size<InputDims>::value;
194  array<Index, NumDims> m_inputStrides;
195  array<Index, NumDims> m_outputStrides;
196  array<Index, NumDims> m_gpuInputStrides;
197  array<Index, NumDims> m_gpuOutputStrides;
198 };
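The stride and output-size bookkeeping performed by the IndexMapper constructor can be illustrated with a small standalone sketch. This is not part of the header; the sizes are made up for the example and only the ColMajor branch is mirrored.

// Illustrative sketch: a 5x7x11 input convolved with a length-3 kernel along dimension 1, column-major.
#include <array>
#include <cstddef>
#include <iostream>

int main() {
  std::array<std::ptrdiff_t, 3> input_dims = {5, 7, 11};
  std::array<std::ptrdiff_t, 3> output_dims = input_dims;
  output_dims[1] = input_dims[1] - 3 + 1;  // "valid" convolution: 7 - 3 + 1 = 5

  std::array<std::ptrdiff_t, 3> input_strides, output_strides;
  input_strides[0] = output_strides[0] = 1;
  for (int i = 1; i < 3; ++i) {
    input_strides[i] = input_strides[i - 1] * input_dims[i - 1];     // 1, 5, 35
    output_strides[i] = output_strides[i - 1] * output_dims[i - 1];  // 1, 5, 25
  }
  std::cout << output_dims[1] << ' ' << input_strides[2] << ' ' << output_strides[2] << '\n';
}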
199 
200 template <typename Dimensions, typename InputXprType, typename KernelXprType>
201 struct traits<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> > {
202  // Type promotion to handle the case where the types of the lhs and the rhs are different.
203  typedef typename promote_storage_type<typename InputXprType::Scalar, typename KernelXprType::Scalar>::ret Scalar;
204  typedef typename promote_storage_type<typename traits<InputXprType>::StorageKind,
205                                        typename traits<KernelXprType>::StorageKind>::ret StorageKind;
206  typedef typename promote_index_type<typename traits<InputXprType>::Index,
207                                      typename traits<KernelXprType>::Index>::type Index;
208  typedef typename InputXprType::Nested LhsNested;
209  typedef typename KernelXprType::Nested RhsNested;
210  typedef std::remove_reference_t<LhsNested> LhsNested_;
211  typedef std::remove_reference_t<RhsNested> RhsNested_;
212  static constexpr int NumDimensions = traits<InputXprType>::NumDimensions;
213  static constexpr int Layout = traits<InputXprType>::Layout;
214  typedef std::conditional_t<Pointer_type_promotion<typename InputXprType::Scalar, Scalar>::val,
215                             typename traits<InputXprType>::PointerType,
216                             typename traits<KernelXprType>::PointerType> PointerType;
217 
218  enum { Flags = 0 };
219 };
220 
221 template <typename Dimensions, typename InputXprType, typename KernelXprType>
222 struct eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, Eigen::Dense> {
223  typedef const TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>& type;
224 };
225 
226 template <typename Dimensions, typename InputXprType, typename KernelXprType>
227 struct nested<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, 1,
228  typename eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >::type> {
229  typedef TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> type;
230 };
231 
232 } // end namespace internal
233 
234 template <typename Indices, typename InputXprType, typename KernelXprType>
235 class TensorConvolutionOp
236     : public TensorBase<TensorConvolutionOp<Indices, InputXprType, KernelXprType>, ReadOnlyAccessors> {
237  public:
238  typedef typename Eigen::internal::traits<TensorConvolutionOp>::Scalar Scalar;
239  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
240  typedef typename internal::promote_storage_type<typename InputXprType::CoeffReturnType,
241  typename KernelXprType::CoeffReturnType>::ret CoeffReturnType;
242  typedef typename Eigen::internal::nested<TensorConvolutionOp>::type Nested;
243  typedef typename Eigen::internal::traits<TensorConvolutionOp>::StorageKind StorageKind;
244  typedef typename Eigen::internal::traits<TensorConvolutionOp>::Index Index;
245 
246  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionOp(const InputXprType& input, const KernelXprType& kernel,
247  const Indices& dims)
248  : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {}
249 
250  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Indices& indices() const { return m_indices; }
251 
252  /** \returns the nested expressions */
253  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const internal::remove_all_t<typename InputXprType::Nested>& inputExpression()
254      const {
255  return m_input_xpr;
256  }
257 
258  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const internal::remove_all_t<typename KernelXprType::Nested>& kernelExpression()
259      const {
260  return m_kernel_xpr;
261  }
262 
263  protected:
264  typename InputXprType::Nested m_input_xpr;
265  typename KernelXprType::Nested m_kernel_xpr;
266  const Indices m_indices;
267 };
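TensorConvolutionOp is normally constructed through TensorBase::convolve rather than directly. A minimal usage sketch (tensor sizes chosen purely for illustration):

// Usage sketch: convolve() performs a "valid" convolution over the listed dimensions.
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 3> input(20, 30, 40);
  Eigen::Tensor<float, 2> kernel(3, 5);
  input.setRandom();
  kernel.setRandom();

  Eigen::array<Eigen::Index, 2> dims = {0, 1};  // convolve along dimensions 0 and 1
  Eigen::Tensor<float, 3> output = input.convolve(kernel, dims);
  // Output dimensions: (20 - 3 + 1) x (30 - 5 + 1) x 40 = 18 x 26 x 40.
  return output.dimension(0) == 18 && output.dimension(1) == 26 ? 0 : 1;
}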
268 
269 template <typename Indices, typename InputArgType, typename KernelArgType, typename Device>
270 struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device> {
271  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
272 
273  static constexpr int NumDims =
274      internal::array_size<typename TensorEvaluator<InputArgType, Device>::Dimensions>::value;
275  static constexpr int NumKernelDims = internal::array_size<Indices>::value;
276  typedef typename XprType::Index Index;
277  typedef DSizes<Index, NumDims> Dimensions;
278 
279  typedef typename XprType::Scalar Scalar;
280  typedef typename XprType::CoeffReturnType CoeffReturnType;
281  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
282  static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;
283  typedef StorageMemory<Scalar, Device> Storage;
284  typedef typename Storage::Type EvaluatorPointerType;
285 
286  static constexpr int Layout = TensorEvaluator<InputArgType, Device>::Layout;
287  enum {
288  IsAligned =
289      int(TensorEvaluator<InputArgType, Device>::IsAligned) & int(TensorEvaluator<KernelArgType, Device>::IsAligned),
290  PacketAccess = int(TensorEvaluator<InputArgType, Device>::PacketAccess) &
291                 int(TensorEvaluator<KernelArgType, Device>::PacketAccess),
292  BlockAccess = false,
293  PreferBlockAccess = false,
294  CoordAccess = false, // to be implemented
295  RawAccess = false
296  };
297 
298  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
299  typedef internal::TensorBlockNotImplemented TensorBlock;
300  //===--------------------------------------------------------------------===//
301 
302  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
303  : m_inputImpl(op.inputExpression(), device),
304  m_kernelImpl(op.kernelExpression(), device),
305  m_kernelArg(op.kernelExpression()),
306  m_kernel(NULL),
307  m_local_kernel(false),
308  m_device(device) {
309  EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Device>::Layout) ==
310                       static_cast<int>(TensorEvaluator<KernelArgType, Device>::Layout)),
311  YOU_MADE_A_PROGRAMMING_MISTAKE);
312 
313  const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions();
314  const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
315 
316  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
317  m_inputStride[0] = 1;
318  for (int i = 1; i < NumDims; ++i) {
319  m_inputStride[i] = m_inputStride[i - 1] * input_dims[i - 1];
320  }
321  } else {
322  m_inputStride[NumDims - 1] = 1;
323  for (int i = NumDims - 2; i >= 0; --i) {
324  m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1];
325  }
326  }
327 
328  m_dimensions = m_inputImpl.dimensions();
329  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
330  for (int i = 0; i < NumKernelDims; ++i) {
331  const Index index = op.indices()[i];
332  const Index input_dim = input_dims[index];
333  const Index kernel_dim = kernel_dims[i];
334  const Index result_dim = input_dim - kernel_dim + 1;
335  m_dimensions[index] = result_dim;
336  if (i > 0) {
337  m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1];
338  } else {
339  m_kernelStride[0] = 1;
340  }
341  m_indexStride[i] = m_inputStride[index];
342  }
343 
344  m_outputStride[0] = 1;
345  for (int i = 1; i < NumDims; ++i) {
346  m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1];
347  }
348  } else {
349  for (int i = NumKernelDims - 1; i >= 0; --i) {
350  const Index index = op.indices()[i];
351  const Index input_dim = input_dims[index];
352  const Index kernel_dim = kernel_dims[i];
353  const Index result_dim = input_dim - kernel_dim + 1;
354  m_dimensions[index] = result_dim;
355  if (i < NumKernelDims - 1) {
356  m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1];
357  } else {
358  m_kernelStride[NumKernelDims - 1] = 1;
359  }
360  m_indexStride[i] = m_inputStride[index];
361  }
362 
363  m_outputStride[NumDims - 1] = 1;
364  for (int i = NumDims - 2; i >= 0; --i) {
365  m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1];
366  }
367  }
368  }
369 
370  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
371 
372  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
373  m_inputImpl.evalSubExprsIfNeeded(NULL);
374  preloadKernel();
375  return true;
376  }
377  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
378  m_inputImpl.cleanup();
379  if (m_local_kernel) {
380  m_device.deallocate((void*)m_kernel);
381  m_local_kernel = false;
382  }
383  m_kernel = NULL;
384  }
385 
386  void evalTo(typename XprType::Scalar* buffer) {
387  evalSubExprsIfNeeded(NULL);
388  for (int i = 0; i < dimensions().TotalSize(); ++i) {
389  buffer[i] += coeff(i);
390  }
391  cleanup();
392  }
393 
394  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
395  CoeffReturnType result = CoeffReturnType(0);
396  convolve(firstInput(index), 0, NumKernelDims - 1, result);
397  return result;
398  }
399 
400  template <int LoadMode>
401  EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const {
402  Index indices[2] = {index, index + PacketSize - 1};
403  Index startInputs[2] = {0, 0};
404  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
405  for (int i = NumDims - 1; i > 0; --i) {
406  const Index idx0 = indices[0] / m_outputStride[i];
407  const Index idx1 = indices[1] / m_outputStride[i];
408  startInputs[0] += idx0 * m_inputStride[i];
409  startInputs[1] += idx1 * m_inputStride[i];
410  indices[0] -= idx0 * m_outputStride[i];
411  indices[1] -= idx1 * m_outputStride[i];
412  }
413  } else {
414  for (int i = 0; i < NumDims - 1; ++i) {
415  const Index idx0 = indices[0] / m_outputStride[i];
416  const Index idx1 = indices[1] / m_outputStride[i];
417  startInputs[0] += idx0 * m_inputStride[i];
418  startInputs[1] += idx1 * m_inputStride[i];
419  indices[0] -= idx0 * m_outputStride[i];
420  indices[1] -= idx1 * m_outputStride[i];
421  }
422  }
423  startInputs[0] += indices[0];
424  startInputs[1] += indices[1];
425 
426  if (startInputs[1] - startInputs[0] == PacketSize - 1) {
427  PacketReturnType result = internal::pset1<PacketReturnType>(0);
428  convolvePacket(startInputs[0], 0, NumKernelDims - 1, result);
429  return result;
430  } else {
431  EIGEN_ALIGN_MAX Scalar data[PacketSize];
432  data[0] = Scalar(0);
433  convolve(startInputs[0], 0, NumKernelDims - 1, data[0]);
434  for (int i = 1; i < PacketSize - 1; ++i) {
435  data[i] = Scalar(0);
436  convolve(firstInput(index + i), 0, NumKernelDims - 1, data[i]);
437  }
438  data[PacketSize - 1] = Scalar(0);
439  convolve(startInputs[1], 0, NumKernelDims - 1, data[PacketSize - 1]);
440  return internal::pload<PacketReturnType>(data);
441  }
442  }
443 
444  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
445  const double kernel_size = m_kernelImpl.dimensions().TotalSize();
446  // We ignore the use of fused multiply-add.
447  const double convolve_compute_cost = TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
448  const double firstIndex_compute_cost =
449  NumDims *
450  (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>());
451  return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
452  kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) +
453  TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize));
454  }
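For a concrete reading of the cost model above, here is a worked example with made-up sizes.

// Illustrative only: a rank-2 input (NumDims = 2) convolved with a 3x3 kernel
// (kernel_size = 9). Each output coefficient is charged
//   firstIndex_compute_cost = 2 * (2 adds + 2 muls + 1 div) of Index arithmetic,
// plus 9 * (one input coefficient read, one kernel coefficient read, and one
// Scalar add + one Scalar mul), matching the TensorOpCost sum returned above.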
455 
456  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
457 
458  private:
459  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
460  Index startInput = 0;
461  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
462  for (int i = NumDims - 1; i > 0; --i) {
463  const Index idx = index / m_outputStride[i];
464  startInput += idx * m_inputStride[i];
465  index -= idx * m_outputStride[i];
466  }
467  } else {
468  for (int i = 0; i < NumDims - 1; ++i) {
469  const Index idx = index / m_outputStride[i];
470  startInput += idx * m_inputStride[i];
471  index -= idx * m_outputStride[i];
472  }
473  }
474  startInput += index;
475  return startInput;
476  }
477 
478  EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const {
479  for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
480  const Index input = firstIndex + j * m_indexStride[DimIndex];
481  const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
482  if (DimIndex > 0) {
483  convolve(input, kernel, DimIndex - 1, accum);
484  } else {
485  accum += m_inputImpl.coeff(input) * m_kernel[kernel];
486  }
487  }
488  }
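The recursion in convolve() peels off one kernel dimension per level; for a two-dimensional kernel it is equivalent to the nested loops in the standalone sketch below. The row-major buffers and the helper name convolve2d are assumptions of the example, not of the evaluator.

// Valid 2-D convolution over plain row-major buffers, equivalent in spirit to
// convolve() specialised to two kernel dimensions.
#include <vector>

std::vector<float> convolve2d(const std::vector<float>& in, int rows, int cols,
                              const std::vector<float>& k, int kr, int kc) {
  const int out_rows = rows - kr + 1, out_cols = cols - kc + 1;
  std::vector<float> out(out_rows * out_cols, 0.0f);
  for (int r = 0; r < out_rows; ++r)
    for (int c = 0; c < out_cols; ++c)
      for (int i = 0; i < kr; ++i)    // outer kernel dimension (outer recursion level)
        for (int j = 0; j < kc; ++j)  // inner kernel dimension (DimIndex == 0 level)
          out[r * out_cols + c] += in[(r + i) * cols + (c + j)] * k[i * kc + j];
  return out;
}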
489 
490  template <typename Packet>
491  EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const {
492  for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
493  const Index input = firstIndex + j * m_indexStride[DimIndex];
494  const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
495  if (DimIndex > 0) {
496  convolvePacket(input, kernel, DimIndex - 1, accum);
497  } else {
498  accum = internal::pmadd<Packet>(m_inputImpl.template packet<Unaligned>(input),
499  internal::pset1<Packet>(m_kernel[kernel]), accum);
500  }
501  }
502  }
503 
504  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
505  // Don't make a local copy of the kernel unless we have to (i.e. it's an
506  // expression that needs to be evaluated)
507  const Scalar* in_place = m_kernelImpl.data();
508  if (in_place) {
509  m_kernel = in_place;
510  m_local_kernel = false;
511  } else {
512  size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
513  Scalar* local = (Scalar*)m_device.allocate_temp(kernel_sz);
514  typedef TensorEvalToOp<const KernelArgType> EvalTo;
515  EvalTo evalToTmp(local, m_kernelArg);
516  const bool Vectorize = internal::IsVectorizable<Device, KernelArgType>::value;
517  internal::TensorExecutor<const EvalTo, Device, Vectorize>::run(evalToTmp, m_device);
518 
519  m_kernel = local;
520  m_local_kernel = true;
521  }
522  }
523 
524  array<Index, NumDims> m_inputStride;
525  array<Index, NumDims> m_outputStride;
526 
527  array<Index, NumKernelDims> m_indexStride;
528  array<Index, NumKernelDims> m_kernelStride;
529  TensorEvaluator<InputArgType, Device> m_inputImpl;
530  TensorEvaluator<KernelArgType, Device> m_kernelImpl;
531  Dimensions m_dimensions;
532 
533  KernelArgType m_kernelArg;
534  const Scalar* m_kernel;
535  bool m_local_kernel;
536  const Device EIGEN_DEVICE_REF m_device;
537 };
538 
539 // Use an optimized implementation of the evaluation code for GPUs whenever possible.
540 #if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
541 
542 template <int StaticKernelSize>
543 struct GetKernelSize {
544  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(const int /*kernelSize*/) const { return StaticKernelSize; }
545 };
546 template <>
547 struct GetKernelSize<Dynamic> {
548  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(const int kernelSize) const { return kernelSize; }
549 };
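GetKernelSize lets the GPU kernels below substitute a compile-time trip count (so the #pragma unroll loops can be fully unrolled) and fall back to the runtime value for Dynamic. The following is a standalone analogue of that behaviour, with Dynamic spelled out as a local constant for the example rather than using the Eigen types.

#include <cassert>

namespace example {
constexpr int Dynamic = -1;  // stand-in for Eigen::Dynamic in this sketch
template <int StaticKernelSize>
struct GetKernelSize {
  int operator()(int /*kernelSize*/) const { return StaticKernelSize; }
};
template <>
struct GetKernelSize<Dynamic> {
  int operator()(int kernelSize) const { return kernelSize; }
};
}  // namespace example

int main() {
  assert(example::GetKernelSize<4>()(7) == 4);                 // static size wins
  assert(example::GetKernelSize<example::Dynamic>()(7) == 7);  // runtime size used
}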
550 
551 template <typename InputEvaluator, typename Index, typename InputDims, int StaticKernelSize>
552 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel1D(
553  InputEvaluator eval, const internal::IndexMapper<Index, InputDims, 1, InputEvaluator::Layout> indexMapper,
554  const float* __restrict kernel, const int numPlanes, const int numX, const int maxX, const int kernelSize,
555  float* buffer) {
556 #if defined(EIGEN_HIPCC)
557  HIP_DYNAMIC_SHARED(float, s)
558 #else
559  extern __shared__ float s[];
560 #endif
561 
562  const int first_x = blockIdx.x * maxX;
563  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
564  const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSize>()(kernelSize);
565  const int num_x_output = last_x - first_x + 1;
566 
567  const int first_plane = blockIdx.y * blockDim.y;
568  const int plane_stride = blockDim.y * gridDim.y;
569 
570  for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) {
571  // Load inputs to shared memory
572  const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
573  const int plane_kernel_offset = threadIdx.y * num_x_input;
574 #pragma unroll
575  for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
576  const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + first_x);
577  s[i + plane_kernel_offset] = eval.coeff(tensor_index);
578  }
579 
580  __syncthreads();
581 
582  // Compute the convolution
583  const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
584 
585 #pragma unroll
586  for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
587  const int kernel_offset = plane_kernel_offset + i;
588  float result = 0.0f;
589 #pragma unroll
590  for (int k = 0; k < GetKernelSize<StaticKernelSize>()(kernelSize); ++k) {
591  result += s[k + kernel_offset] * kernel[k];
592  }
593  const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i + first_x);
594  buffer[tensor_index] = result;
595  }
596  __syncthreads();
597  }
598 };
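The shared-memory tile loaded by the 1-D kernel holds, per plane handled by a block row, the maxX output points plus a halo of kernelSize - 1 extra inputs. A small host-side sketch of that sizing, with made-up launch parameters:

// Illustrative only: mirrors num_x_input above and the shared_mem computation
// in executeEval() further down.
#include <cstddef>
#include <cstdio>

int main() {
  const int maxX = 128, kernelSize = 7, block_y = 4;
  const int num_x_input = maxX + kernelSize - 1;  // inputs cached per plane row
  const std::size_t shared_bytes =
      static_cast<std::size_t>(block_y) * num_x_input * sizeof(float);
  std::printf("%d inputs per row, %zu bytes of shared memory per block\n",
              num_x_input, shared_bytes);
}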
599 
600 template <typename InputEvaluator, typename Index, typename InputDims, int StaticKernelSizeX, int StaticKernelSizeY>
601 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel2D(
602  InputEvaluator eval, const internal::IndexMapper<Index, InputDims, 2, InputEvaluator::Layout> indexMapper,
603  const float* __restrict kernel, const int numPlanes, const int numX, const int maxX, const int numY, const int maxY,
604  const int kernelSizeX, const int kernelSizeY, float* buffer) {
605 #if defined(EIGEN_HIPCC)
606  HIP_DYNAMIC_SHARED(float, s)
607 #else
608  extern __shared__ float s[];
609 #endif
610 
611  const int first_x = blockIdx.x * maxX;
612  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
613  const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSizeX>()(kernelSizeX);
614  const int num_x_output = last_x - first_x + 1;
615 
616  const int first_y = blockIdx.y * maxY;
617  const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
618  const int num_y_input = last_y - first_y + GetKernelSize<StaticKernelSizeY>()(kernelSizeY);
619  const int num_y_output = last_y - first_y + 1;
620 
621  const int first_plane = blockIdx.z * blockDim.z;
622  const int plane_stride = blockDim.z * gridDim.z;
623 
624  for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) {
625  const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
626  const int plane_kernel_offset = threadIdx.z * num_y_input;
627 
628 // Load inputs to shared memory
629 #pragma unroll
630  for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
631  const int input_offset = num_x_input * (j + plane_kernel_offset);
632 #pragma unroll
633  for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
634  const int tensor_index =
635  plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + first_x, j + first_y);
636  s[i + input_offset] = eval.coeff(tensor_index);
637  }
638  }
639 
640  __syncthreads();
641 
642  // Convolution
643  const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
644 
645 #pragma unroll
646  for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
647 #pragma unroll
648  for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
649  float result = 0.0f;
650 #pragma unroll
651  for (int l = 0; l < GetKernelSize<StaticKernelSizeY>()(kernelSizeY); ++l) {
652  const int kernel_offset = kernelSizeX * l;
653  const int input_offset = i + num_x_input * (j + l + plane_kernel_offset);
654 #pragma unroll
655  for (int k = 0; k < GetKernelSize<StaticKernelSizeX>()(kernelSizeX); ++k) {
656  result += s[k + input_offset] * kernel[k + kernel_offset];
657  }
658  }
659  const int tensor_index =
660  plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i + first_x, j + first_y);
661  buffer[tensor_index] = result;
662  }
663  }
664 
665  __syncthreads();
666  }
667 };
668 
669 template <typename InputEvaluator, typename Index, typename InputDims>
670 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel3D(
671  InputEvaluator eval, const internal::IndexMapper<Index, InputDims, 3, InputEvaluator::Layout> indexMapper,
672  const float* __restrict kernel, const size_t numPlanes, const size_t numX, const size_t maxX, const size_t numY,
673  const size_t maxY, const size_t numZ, const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY,
674  const size_t kernelSizeZ, float* buffer) {
675 #if defined(EIGEN_HIPCC)
676  HIP_DYNAMIC_SHARED(float, s)
677 #else
678  extern __shared__ float s[];
679 #endif
680 
681  // Load inputs to shared memory
682  const int first_x = blockIdx.x * maxX;
683  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
684  const int num_x_input = last_x - first_x + kernelSizeX;
685 
686  const int first_y = blockIdx.y * maxY;
687  const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
688  const int num_y_input = last_y - first_y + kernelSizeY;
689 
690  const int first_z = blockIdx.z * maxZ;
691  const int last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1;
692  const int num_z_input = last_z - first_z + kernelSizeZ;
693 
694  for (int p = 0; p < numPlanes; ++p) {
695  const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
696  const int plane_kernel_offset = 0;
697 
698  for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) {
699  for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
700  for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
701  const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(
702  i + first_x, j + first_y, k + first_z);
703  s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index);
704  }
705  }
706  }
707 
708  __syncthreads();
709 
710  // Convolution
711  const int num_z_output = last_z - first_z + 1;
712  const int num_y_output = last_y - first_y + 1;
713  const int num_x_output = last_x - first_x + 1;
714  const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
715 
716  for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) {
717  for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
718  for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
719  float result = 0.0f;
720  for (int n = 0; n < kernelSizeZ; ++n) {
721  for (int m = 0; m < kernelSizeY; ++m) {
722  for (int l = 0; l < kernelSizeX; ++l) {
723  result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] *
724  kernel[l + kernelSizeX * (m + kernelSizeY * n)];
725  }
726  }
727  }
728  const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(
729  i + first_x, j + first_y, k + first_z);
730  buffer[tensor_index] = result;
731  }
732  }
733  }
734  __syncthreads();
735  }
736 };
737 
738 template <typename Indices, typename InputArgType, typename KernelArgType>
739 struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, GpuDevice> {
740  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
741 
742  static constexpr int NumDims =
743  internal::array_size<typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions>::value;
744  static constexpr int NumKernelDims = internal::array_size<Indices>::value;
745  typedef typename XprType::Index Index;
746  typedef DSizes<Index, NumDims> Dimensions;
747  typedef typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions KernelDimensions;
748 
749  static constexpr int Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout;
750  enum {
751  IsAligned =
752      TensorEvaluator<InputArgType, GpuDevice>::IsAligned & TensorEvaluator<KernelArgType, GpuDevice>::IsAligned,
753  PacketAccess = false,
754  BlockAccess = false,
755  PreferBlockAccess = false,
756  CoordAccess = false, // to be implemented
757  RawAccess = false
758  };
759 
760  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
761  typedef internal::TensorBlockNotImplemented TensorBlock;
762  //===--------------------------------------------------------------------===//
763 
764  TensorEvaluator(const XprType& op, const GpuDevice& device)
765  : m_inputImpl(op.inputExpression(), device),
766  m_kernelImpl(op.kernelExpression(), device),
767  m_kernelArg(op.kernelExpression()),
768  m_indices(op.indices()),
769  m_buf(NULL),
770  m_kernel(NULL),
771  m_local_kernel(false),
772  m_device(device) {
773  EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, GpuDevice>::Layout) ==
774                       static_cast<int>(TensorEvaluator<KernelArgType, GpuDevice>::Layout)),
775  YOU_MADE_A_PROGRAMMING_MISTAKE);
776 
777  const typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions& input_dims = m_inputImpl.dimensions();
778  const typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
779 
780  m_dimensions = m_inputImpl.dimensions();
781  for (int i = 0; i < NumKernelDims; ++i) {
782  const Index index = op.indices()[i];
783  const Index input_dim = input_dims[index];
784  const Index kernel_dim = kernel_dims[i];
785  const Index result_dim = input_dim - kernel_dim + 1;
786  m_dimensions[index] = result_dim;
787  }
788  }
789 
790  typedef typename XprType::CoeffReturnType CoeffReturnType;
791  typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType;
792  typedef typename InputArgType::Scalar Scalar;
793  static constexpr int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
794 
795  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; }
796 
797  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
798  preloadKernel();
799  m_inputImpl.evalSubExprsIfNeeded(NULL);
800  if (data) {
801  executeEval(data);
802  return false;
803  } else {
804  m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar));
805  executeEval(m_buf);
806  return true;
807  }
808  }
809 
810  EIGEN_STRONG_INLINE void cleanup() {
811  m_inputImpl.cleanup();
812  if (m_buf) {
813  m_device.deallocate(m_buf);
814  m_buf = NULL;
815  }
816  if (m_local_kernel) {
817  m_device.deallocate((void*)m_kernel);
818  m_local_kernel = false;
819  }
820  m_kernel = NULL;
821  }
822 
823  EIGEN_STRONG_INLINE void preloadKernel() {
824  // Don't make a local copy of the kernel unless we have to (i.e. it's an
825  // expression that needs to be evaluated)
826  const Scalar* in_place = m_kernelImpl.data();
827  if (in_place) {
828  m_kernel = in_place;
829  m_local_kernel = false;
830  } else {
831  size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
832  Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
833  typedef TensorEvalToOp<const KernelArgType> EvalTo;
834  EvalTo evalToTmp(local, m_kernelArg);
835  const bool PacketAccess = internal::IsVectorizable<GpuDevice, KernelArgType>::value;
836  internal::TensorExecutor<const EvalTo, GpuDevice, PacketAccess>::run(evalToTmp, m_device);
837 
838  m_kernel = local;
839  m_local_kernel = true;
840  }
841  }
842 
843  static unsigned int ceil(unsigned int num, unsigned int denom) {
844  const unsigned int rounded_toward_zero = num / denom;
845  if (num > rounded_toward_zero * denom) {
846  return rounded_toward_zero + 1;
847  }
848  return rounded_toward_zero;
849  }
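ceil() above is plain round-up integer division, used to size the launch grid. For example:

// ceil(100, 32) == 4: four blocks of 32 threads cover 100 elements.
// ceil(96, 32)  == 3: exact multiples are not rounded up.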
850 
851  void executeEval(Scalar* data) const {
852  typedef typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions InputDims;
853 
854  const int maxSharedMem = m_device.sharedMemPerBlock();
855  const int maxThreadsPerBlock = m_device.maxGpuThreadsPerBlock();
856  const int maxBlocksPerProcessor = m_device.maxGpuThreadsPerMultiProcessor() / maxThreadsPerBlock;
857  const int numMultiProcessors = m_device.getNumGpuMultiProcessors();
858  const int warpSize = 32;
859 
860  switch (NumKernelDims) {
861  case 1: {
862  const int kernel_size = m_kernelImpl.dimensions().TotalSize();
863 
864  const int numX = dimensions()[m_indices[0]];
865  const int numP = dimensions().TotalSize() / numX;
866  int maxX;
867  dim3 block_size;
868 
869  const int single_stride_dim =
870  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : m_inputImpl.dimensions().rank() - 1;
871  if (m_indices[0] == single_stride_dim) {
872  // Maximize the reuse
873  const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32;
874  maxX = numext::mini<int>(inner_dim, numX);
875  const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP);
876  block_size.x = numext::mini(maxThreadsPerBlock, maxX);
877  block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP);
878  } else {
879  // Read as much as possible along the innermost dimension, that is the plane
880  const int inner_dim = maxSharedMem / ((warpSize + kernel_size) * sizeof(Scalar));
881  const int maxP = numext::mini<int>(inner_dim, numP);
882  maxX = numext::mini<int>(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX);
883 
884  block_size.x = numext::mini(warpSize, maxX);
885  block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP);
886  }
887 
888  const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar);
889  gpu_assert(shared_mem <= maxSharedMem);
890 
891  const int num_x_blocks = ceil(numX, maxX);
892  const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
893  const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks);
894 
895  dim3 num_blocks(num_x_blocks, numext::mini<int>(num_y_blocks, ceil(numP, block_size.y)));
896 
897  // cout << "launching 1D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << "
898  // num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " maxX: " << maxX << " shared_mem: "
899  // << shared_mem << " in stream " << m_device.stream() << endl;
900 
901  const array<Index, 1> indices{m_indices[0]};
902  const array<Index, 1> kernel_dims{m_kernelImpl.dimensions()[0]};
903  internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
904  switch (kernel_size) {
905  case 4: {
906  LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>),
907  num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP,
908  numX, maxX, 4, data);
909  break;
910  }
911  case 7: {
912  LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>),
913  num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP,
914  numX, maxX, 7, data);
915  break;
916  }
917  default: {
918  LAUNCH_GPU_KERNEL(
919  (EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>),
920  num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX,
921  kernel_size, data);
922  }
923  }
924  break;
925  }
926 
927  case 2: {
928  const int idxX = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1;
929  const int idxY = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0;
930  const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
931  const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
932 
933  const int numX = dimensions()[m_indices[idxX]];
934  const int numY = dimensions()[m_indices[idxY]];
935  const int numP = dimensions().TotalSize() / (numX * numY);
936 
937  const float scaling_factor =
938  sqrtf(static_cast<float>(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x));
939 
940  // Snap maxX to warp size
941  int inner_dim = ((static_cast<int>(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32;
942  const int maxX = numext::mini<int>(inner_dim, numX);
943  const int maxY =
944  numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY);
945  const int maxP = numext::mini<int>(
946  maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP);
947 
948  dim3 block_size;
949  block_size.x = numext::mini(1024, maxX);
950  block_size.y = numext::mini<int>(1024 / block_size.x, maxY);
951  block_size.z = numext::mini<int>(1024 / (block_size.x * block_size.y), maxP);
952 
953  const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar);
954  gpu_assert(shared_mem <= maxSharedMem);
955 
956  const int num_x_blocks = ceil(numX, maxX);
957  const int num_y_blocks = ceil(numY, maxY);
958  const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
959  const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks);
960 
961  dim3 num_blocks(num_x_blocks, num_y_blocks, numext::mini<int>(num_z_blocks, ceil(numP, block_size.z)));
962 
963  // cout << "launching 2D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << "
964  // block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y <<
965  // " num_blocks.z: " << num_blocks.z << " maxX: " << maxX << " maxY: " << maxY << " maxP: " << maxP << "
966  // shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
967 
968  const array<Index, 2> indices{m_indices[idxX], m_indices[idxY]};
969  const array<Index, 2> kernel_dims{m_kernelImpl.dimensions()[idxX], m_kernelImpl.dimensions()[idxY]};
970  internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
971  switch (kernel_size_x) {
972  case 4: {
973  switch (kernel_size_y) {
974  case 7: {
975  LAUNCH_GPU_KERNEL(
976  (EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>),
977  num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX,
978  numY, maxY, 4, 7, data);
979  break;
980  }
981  default: {
982  LAUNCH_GPU_KERNEL(
983  (EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>),
984  num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX,
985  numY, maxY, 4, kernel_size_y, data);
986  break;
987  }
988  }
989  break;
990  }
991  case 7: {
992  switch (kernel_size_y) {
993  case 4: {
994  LAUNCH_GPU_KERNEL(
995  (EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>),
996  num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX,
997  numY, maxY, 7, 4, data);
998  break;
999  }
1000  default: {
1001  LAUNCH_GPU_KERNEL(
1002  (EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>),
1003  num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX,
1004  numY, maxY, 7, kernel_size_y, data);
1005  break;
1006  }
1007  }
1008  break;
1009  }
1010  default: {
1011  LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims,
1012  Dynamic, Dynamic>),
1013  num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP,
1014  numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data);
1015  break;
1016  }
1017  }
1018  break;
1019  }
1020 
1021  case 3: {
1022  const int idxX = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2;
1023  const int idxY = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1;
1024  const int idxZ = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0;
1025 
1026  const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
1027  const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
1028  const int kernel_size_z = m_kernelImpl.dimensions()[idxZ];
1029 
1030  const int numX = dimensions()[m_indices[idxX]];
1031  const int numY = dimensions()[m_indices[idxY]];
1032  const int numZ = dimensions()[m_indices[idxZ]];
1033  const int numP = dimensions().TotalSize() / (numX * numY * numZ);
1034 
1035  const int maxX = numext::mini<int>(
1036  128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1,
1037  numX));
1038  const int maxY = numext::mini<int>(
1039  128, numext::mini<int>(
1040  maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1,
1041  numY));
1042  const int maxZ = numext::mini<int>(
1043  128, numext::mini<int>(
1044  maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) -
1045  kernel_size_z + 1,
1046  numZ));
1047 
1048  dim3 block_size;
1049  block_size.x = numext::mini(32, maxX);
1050  block_size.y = numext::mini(32, maxY);
1051  block_size.z = numext::mini<int>(1024 / (block_size.x * block_size.y), maxZ);
1052  dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ));
1053 
1054  const int shared_mem =
1055  (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar);
1056  gpu_assert(shared_mem <= maxSharedMem);
1057 
1058  // cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << "
1059  // block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y <<
1060  // " num_blocks.z: " << num_blocks.z << " shared_mem: " << shared_mem << " in stream " << m_device.stream() <<
1061  // endl;
1062  const array<Index, 3> indices{m_indices[idxX], m_indices[idxY], m_indices[idxZ]};
1063  const array<Index, 3> kernel_dims{m_kernelImpl.dimensions()[idxX], m_kernelImpl.dimensions()[idxY],
1064  m_kernelImpl.dimensions()[idxZ]};
1065  internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
1066 
1067  LAUNCH_GPU_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>),
1068  num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX,
1069  maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data);
1070  break;
1071  }
1072 
1073  default: {
1074  EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3),
1075  THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
1076  }
1077  }
1078  }
1079 
1080  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
1081  eigen_assert(m_buf);
1082  eigen_assert(index < m_dimensions.TotalSize());
1083  return m_buf[index];
1084  }
1085 
1086  template <int LoadMode>
1087  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const {
1088  eigen_assert(m_buf);
1089  eigen_assert(index < m_dimensions.TotalSize());
1090  return internal::ploadt<PacketReturnType, LoadMode>(m_buf + index);
1091  }
1092 
1093  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
1094  // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost
1095  // model.
1096  const double kernel_size = m_kernelImpl.dimensions().TotalSize();
1097  // We ignore the use of fused multiply-add.
1098  const double convolve_compute_cost = TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
1099  const double firstIndex_compute_cost =
1100  NumDims *
1101  (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>());
1102  return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
1103  kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) +
1104  TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize));
1105  }
1106 
1107  private:
1108  TensorEvaluator<InputArgType, GpuDevice> m_inputImpl;
1109  TensorEvaluator<KernelArgType, GpuDevice> m_kernelImpl;
1110  KernelArgType m_kernelArg;
1111  Indices m_indices;
1112  Dimensions m_dimensions;
1113  Scalar* m_buf;
1114  const Scalar* m_kernel;
1115  bool m_local_kernel;
1116 
1117  const GpuDevice& m_device;
1118 };
1119 #endif
1120 
1121 } // end namespace Eigen
1122 
1123 #endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H