#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
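// A hedged usage sketch (not part of this header): assuming the public Tensor API of
// unsupported/Eigen/CXX11/Tensor, the TensorConvolutionOp defined below is normally
// created through Tensor::convolve(), e.g. a 1-D "valid" convolution along dimension 1
// of a rank-2 tensor:
//
//   Eigen::Tensor<float, 2> input(4, 10);
//   Eigen::Tensor<float, 1> kernel(3);
//   Eigen::array<Eigen::Index, 1> dims{1};  // convolve along dimension 1
//   Eigen::Tensor<float, 2> output = input.convolve(kernel, dims);
//   // output dimensions: (4, 10 - 3 + 1) = (4, 8)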
// IndexMapper<Index, InputDims, NumKernelDims, Layout> constructor (fragment):
// compute the "valid" convolution size of each convolved dimension.
template <typename Index, typename InputDims, int NumKernelDims, int Layout>
for (int i = 0; i < NumKernelDims; ++i) {
  const Index index = indices[i];
  const Index input_dim = input_dims[index];
  const Index kernel_dim = kernel_dims[i];
  const Index result_dim = input_dim - kernel_dim + 1;
  dimensions[index] = result_dim;
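// Each convolved dimension shrinks to the "valid" convolution size. Illustrative
// values (not from this file): input_dim = 10 and kernel_dim = 3 give
// result_dim = 10 - 3 + 1 = 8; non-convolved dimensions are left unchanged.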
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
  inputStrides[i] = inputStrides[i - 1] * input_dims[i - 1];
  outputStrides[i] = outputStrides[i - 1] * dimensions[i - 1];
for (int i = static_cast<int>(NumDims) - 2; i >= 0; --i) {
  inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1];
  outputStrides[i] = outputStrides[i + 1] * dimensions[i + 1];
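// These are the usual dense-tensor strides: innermost dimension first for ColMajor,
// last for RowMajor. Illustrative values (assumed, not from this file): input
// dimensions {5, 7, 11} give inputStrides = {1, 5, 35} in ColMajor and
// inputStrides = {77, 11, 1} in RowMajor.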
const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - NumKernelDims;
for (int i = 0; i < NumKernelDims; ++i) {
  const Index index = i + offset;
  ordering[index] = indices[i];
  gpuInputDimensions[index] = input_dims[indices[i]];
  gpuOutputDimensions[index] = dimensions[indices[i]];
int written = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? NumKernelDims : 0;
ordering[written] = i;
gpuInputDimensions[written] = input_dims[i];
gpuOutputDimensions[written] = dimensions[i];
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
  if (i > NumKernelDims) {
if (static_cast<size_t>(i + 1) < offset) {
Index inputIndex = 0;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
  for (int d = NumDims - 1; d > NumKernelDims; --d) {
std::ptrdiff_t limit = 0;
limit = NumDims - NumKernelDims - 1;
for (int d = 0; d < limit; ++d) {
Index outputIndex = 0;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
  for (int d = NumDims - 1; d > NumKernelDims; --d) {
std::ptrdiff_t limit = 0;
limit = NumDims - NumKernelDims - 1;
for (int d = 0; d < limit; ++d) {
// The same layout-dependent offset is recomputed in each of the six
// mapGpuInput/OutputKernelToTensorInput/OutputOffset helpers:
const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - NumKernelDims;
const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - NumKernelDims;
const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - NumKernelDims;
const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - NumKernelDims;
const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - NumKernelDims;
const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - NumKernelDims;
template <typename Dimensions, typename InputXprType, typename KernelXprType>
template <typename Dimensions, typename InputXprType, typename KernelXprType>
template <typename Dimensions, typename InputXprType, typename KernelXprType>
template <typename Indices, typename InputXprType, typename KernelXprType>
class TensorConvolutionOp
    : public TensorBase<TensorConvolutionOp<Indices, InputXprType, KernelXprType>, ReadOnlyAccessors> {
// TensorEvaluator specialization of TensorConvolutionOp for a generic Device (fragment).
template <typename Indices, typename InputArgType, typename KernelArgType, typename Device>
static constexpr int NumDims =
PreferBlockAccess = false,
: m_inputImpl(op.inputExpression(), device),
  m_kernelImpl(op.kernelExpression(), device),
  m_kernelArg(op.kernelExpression()),
  m_local_kernel(false),
YOU_MADE_A_PROGRAMMING_MISTAKE);
m_inputStride[0] = 1;
for (int i = 1; i < NumDims; ++i) {
  m_inputStride[i] = m_inputStride[i - 1] * input_dims[i - 1];
m_inputStride[NumDims - 1] = 1;
for (int i = NumDims - 2; i >= 0; --i) {
  m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1];
for (int i = 0; i < NumKernelDims; ++i) {
  const Index index = op.indices()[i];
  const Index input_dim = input_dims[index];
  const Index kernel_dim = kernel_dims[i];
  const Index result_dim = input_dim - kernel_dim + 1;
  m_dimensions[index] = result_dim;
  m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1];
  m_kernelStride[0] = 1;
  m_indexStride[i] = m_inputStride[index];
m_outputStride[0] = 1;
for (int i = 1; i < NumDims; ++i) {
  m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1];
for (int i = NumKernelDims - 1; i >= 0; --i) {
  const Index index = op.indices()[i];
  const Index input_dim = input_dims[index];
  const Index kernel_dim = kernel_dims[i];
  const Index result_dim = input_dim - kernel_dim + 1;
  m_dimensions[index] = result_dim;
  if (i < NumKernelDims - 1) {
    m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1];
  m_kernelStride[NumKernelDims - 1] = 1;
  m_indexStride[i] = m_inputStride[index];
m_outputStride[NumDims - 1] = 1;
for (int i = NumDims - 2; i >= 0; --i) {
  m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1];
m_inputImpl.evalSubExprsIfNeeded(NULL);
m_inputImpl.cleanup();
if (m_local_kernel) {
  m_device.deallocate((void*)m_kernel);
  m_local_kernel = false;
convolve(firstInput(index), 0, NumKernelDims - 1, result);
template <int LoadMode>
Index startInputs[2] = {0, 0};
for (int i = NumDims - 1; i > 0; --i) {
  const Index idx0 = indices[0] / m_outputStride[i];
  const Index idx1 = indices[1] / m_outputStride[i];
  startInputs[0] += idx0 * m_inputStride[i];
  startInputs[1] += idx1 * m_inputStride[i];
  indices[0] -= idx0 * m_outputStride[i];
  indices[1] -= idx1 * m_outputStride[i];
for (int i = 0; i < NumDims - 1; ++i) {
  const Index idx0 = indices[0] / m_outputStride[i];
  const Index idx1 = indices[1] / m_outputStride[i];
  startInputs[0] += idx0 * m_inputStride[i];
  startInputs[1] += idx1 * m_inputStride[i];
  indices[0] -= idx0 * m_outputStride[i];
  indices[1] -= idx1 * m_outputStride[i];
startInputs[0] += indices[0];
startInputs[1] += indices[1];
if (startInputs[1] - startInputs[0] == PacketSize - 1) {
  convolvePacket(startInputs[0], 0, NumKernelDims - 1, result);
convolve(startInputs[0], 0, NumKernelDims - 1, data[0]);
convolve(firstInput(index + i), 0, NumKernelDims - 1, data[i]);
convolve(startInputs[1], 0, NumKernelDims - 1, data[PacketSize - 1]);
return internal::pload<PacketReturnType>(data);
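// Packet path: startInputs[0] and startInputs[1] are the input offsets of the first
// and last coefficient of the requested packet. When they are exactly PacketSize - 1
// apart, the inputs are contiguous and a single packet-wide convolvePacket can be
// used; otherwise each coefficient is convolved individually into the local data[]
// buffer, which is then loaded with pload.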
const double kernel_size = m_kernelImpl.dimensions().TotalSize();
const double convolve_compute_cost = TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
const double firstIndex_compute_cost =
    (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>());
kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) +
Index startInput = 0;
for (int i = NumDims - 1; i > 0; --i) {
  const Index idx = index / m_outputStride[i];
  startInput += idx * m_inputStride[i];
  index -= idx * m_outputStride[i];
for (int i = 0; i < NumDims - 1; ++i) {
  const Index idx = index / m_outputStride[i];
  startInput += idx * m_inputStride[i];
  index -= idx * m_outputStride[i];
for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
  const Index input = firstIndex + j * m_indexStride[DimIndex];
  const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
  convolve(input, kernel, DimIndex - 1, accum);
  accum += m_inputImpl.coeff(input) * m_kernel[kernel];
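// The recursion above peels off one kernel dimension per level: for each tap j along
// dimension DimIndex it advances the input by m_indexStride[DimIndex] and the kernel
// by m_kernelStride[DimIndex], recursing into the lower dimensions and performing the
// multiply-accumulate at the innermost one. A minimal standalone sketch of the same
// idea (hypothetical names, plain arrays instead of evaluators):
//
//   void conv_rec(const float* in, const float* ker, const int* ker_dims,
//                 const int* in_stride, const int* ker_stride, int dim, float& acc) {
//     for (int j = 0; j < ker_dims[dim]; ++j) {
//       if (dim > 0)
//         conv_rec(in + j * in_stride[dim], ker + j * ker_stride[dim],
//                  ker_dims, in_stride, ker_stride, dim - 1, acc);
//       else
//         acc += in[j * in_stride[0]] * ker[j * ker_stride[0]];
//     }
//   }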
template <typename Packet>
for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
  const Index input = firstIndex + j * m_indexStride[DimIndex];
  const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
  convolvePacket(input, kernel, DimIndex - 1, accum);
  accum = internal::pmadd<Packet>(m_inputImpl.template packet<Unaligned>(input),
                                  internal::pset1<Packet>(m_kernel[kernel]), accum);
m_local_kernel = false;
size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
EvalTo evalToTmp(local, m_kernelArg);
m_local_kernel = true;
#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)

template <int StaticKernelSize>
struct GetKernelSize {
struct GetKernelSize<Dynamic> {
template <typename InputEvaluator, typename Index, typename InputDims, int StaticKernelSize>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel1D(
    InputEvaluator eval, const internal::IndexMapper<Index, InputDims, 1, InputEvaluator::Layout> indexMapper,
    const float* __restrict kernel, const int numPlanes, const int numX, const int maxX, const int kernelSize,
    float* buffer) {
#if defined(EIGEN_HIPCC)
  HIP_DYNAMIC_SHARED(float, s)
#else
  extern __shared__ float s[];
#endif
const int first_x = blockIdx.x * maxX;
const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSize>()(kernelSize);
const int num_x_output = last_x - first_x + 1;
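// Shared-memory tiling: each block loads an input strip that is kernelSize - 1
// elements wider than the output strip it produces, so every output in
// [first_x, last_x] can read its full kernel support from shared memory.
// Illustrative numbers (assumed): maxX = 128 and kernelSize = 7 give
// num_x_output = 128 and num_x_input = 134 floats of shared memory per plane.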
const int plane_stride = blockDim.y * gridDim.y;
for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) {
const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
const int plane_kernel_offset = threadIdx.y * num_x_input;
const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + first_x);
s[i + plane_kernel_offset] = eval.coeff(tensor_index);
const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
const int kernel_offset = plane_kernel_offset + i;
for (int k = 0; k < GetKernelSize<StaticKernelSize>()(kernelSize); ++k) {
  result += s[k + kernel_offset] * kernel[k];
const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i + first_x);
buffer[tensor_index] = result;
template <typename InputEvaluator, typename Index, typename InputDims, int StaticKernelSizeX, int StaticKernelSizeY>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel2D(
    InputEvaluator eval, const internal::IndexMapper<Index, InputDims, 2, InputEvaluator::Layout> indexMapper,
    const float* __restrict kernel, const int numPlanes, const int numX, const int maxX, const int numY,
    const int maxY, const int kernelSizeX, const int kernelSizeY, float* buffer) {
#if defined(EIGEN_HIPCC)
  HIP_DYNAMIC_SHARED(float, s)
#else
  extern __shared__ float s[];
#endif
const int first_x = blockIdx.x * maxX;
const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSizeX>()(kernelSizeX);
const int num_x_output = last_x - first_x + 1;

const int first_y = blockIdx.y * maxY;
const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
const int num_y_input = last_y - first_y + GetKernelSize<StaticKernelSizeY>()(kernelSizeY);
const int num_y_output = last_y - first_y + 1;

const int plane_stride = blockDim.z * gridDim.z;
for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) {
  const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
  const int plane_kernel_offset = threadIdx.z * num_y_input;
const int input_offset = num_x_input * (j + plane_kernel_offset);
const int tensor_index =
    plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + first_x, j + first_y);
s[i + input_offset] = eval.coeff(tensor_index);
const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
for (int l = 0; l < GetKernelSize<StaticKernelSizeY>()(kernelSizeY); ++l) {
  const int kernel_offset = kernelSizeX * l;
  const int input_offset = i + num_x_input * (j + l + plane_kernel_offset);
  for (int k = 0; k < GetKernelSize<StaticKernelSizeX>()(kernelSizeX); ++k) {
    result += s[k + input_offset] * kernel[k + kernel_offset];
const int tensor_index =
    plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i + first_x, j + first_y);
buffer[tensor_index] = result;
template <typename InputEvaluator, typename Index, typename InputDims>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel3D(
    InputEvaluator eval, const internal::IndexMapper<Index, InputDims, 3, InputEvaluator::Layout> indexMapper,
    const float* __restrict kernel, const size_t numPlanes, const size_t numX, const size_t maxX, const size_t numY,
    const size_t maxY, const size_t numZ, const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY,
    const size_t kernelSizeZ, float* buffer) {
#if defined(EIGEN_HIPCC)
  HIP_DYNAMIC_SHARED(float, s)
#else
  extern __shared__ float s[];
#endif
const int first_x = blockIdx.x * maxX;
const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
const int num_x_input = last_x - first_x + kernelSizeX;

const int first_y = blockIdx.y * maxY;
const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
const int num_y_input = last_y - first_y + kernelSizeY;

const int first_z = blockIdx.z * maxZ;
const int last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1;
const int num_z_input = last_z - first_z + kernelSizeZ;
for (int p = 0; p < numPlanes; ++p) {
  const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
  const int plane_kernel_offset = 0;
const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(
    i + first_x, j + first_y, k + first_z);
s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index);
const int num_z_output = last_z - first_z + 1;
const int num_y_output = last_y - first_y + 1;
const int num_x_output = last_x - first_x + 1;
const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
for (int n = 0; n < kernelSizeZ; ++n) {
  for (int m = 0; m < kernelSizeY; ++m) {
    for (int l = 0; l < kernelSizeX; ++l) {
      result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] *
                kernel[l + kernelSizeX * (m + kernelSizeY * n)];
const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(
    i + first_x, j + first_y, k + first_z);
buffer[tensor_index] = result;
template <typename Indices, typename InputArgType, typename KernelArgType>
struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, GpuDevice> {
  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;

  static constexpr int NumDims =
      internal::array_size<typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions>::value;
  PreferBlockAccess = false,
  typedef internal::TensorBlockNotImplemented TensorBlock;
: m_inputImpl(op.inputExpression(), device),
  m_kernelImpl(op.kernelExpression(), device),
  m_kernelArg(op.kernelExpression()),
  m_indices(op.indices()),
  m_local_kernel(false),
YOU_MADE_A_PROGRAMMING_MISTAKE);
m_dimensions = m_inputImpl.dimensions();
for (int i = 0; i < NumKernelDims; ++i) {
  const Index index = op.indices()[i];
  const Index input_dim = input_dims[index];
  const Index kernel_dim = kernel_dims[i];
  const Index result_dim = input_dim - kernel_dim + 1;
  m_dimensions[index] = result_dim;
m_inputImpl.evalSubExprsIfNeeded(NULL);
m_inputImpl.cleanup();
if (m_local_kernel) {
  m_device.deallocate((void*)m_kernel);
  m_local_kernel = false;
m_local_kernel = false;
size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
typedef TensorEvalToOp<const KernelArgType> EvalTo;
EvalTo evalToTmp(local, m_kernelArg);
m_local_kernel = true;
static unsigned int ceil(unsigned int num, unsigned int denom) {
  const unsigned int rounded_toward_zero = num / denom;
  if (num > rounded_toward_zero * denom) {
    return rounded_toward_zero + 1;
  }
  return rounded_toward_zero;
}
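// The helper above is integer ceiling division, e.g. ceil(10, 3) == 4 and
// ceil(9, 3) == 3; it is used below to compute how many blocks are needed to
// cover each dimension.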
const int maxSharedMem = m_device.sharedMemPerBlock();
const int maxThreadsPerBlock = m_device.maxGpuThreadsPerBlock();
const int maxBlocksPerProcessor = m_device.maxGpuThreadsPerMultiProcessor() / maxThreadsPerBlock;
const int numMultiProcessors = m_device.getNumGpuMultiProcessors();
const int warpSize = 32;
switch (NumKernelDims) {
const int kernel_size = m_kernelImpl.dimensions().TotalSize();
const int numP = dimensions().TotalSize() / numX;
const int single_stride_dim =
    static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : m_inputImpl.dimensions().rank() - 1;
if (m_indices[0] == single_stride_dim) {
  const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32;
  maxX = numext::mini<int>(inner_dim, numX);
  const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP);
  block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP);
  const int inner_dim = maxSharedMem / ((warpSize + kernel_size) * sizeof(Scalar));
  const int maxP = numext::mini<int>(inner_dim, numP);
  maxX = numext::mini<int>(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX);
  block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP);
const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar);
gpu_assert(shared_mem <= maxSharedMem);
const int num_x_blocks = ceil(numX, maxX);
const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks);
dim3 num_blocks(num_x_blocks, numext::mini<int>(num_y_blocks, ceil(numP, block_size.y)));
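// Illustrative block/grid sizing (values assumed, not from this file): with
// numX = 1000 and maxX = 128, ceil(numX, maxX) = 8 x-blocks; the y grid dimension is
// then chosen so that roughly numMultiProcessors * blocksPerProcessor blocks are in
// flight, capped by ceil(numP, block_size.y) so no block is launched without planes
// to process.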
const array<Index, 1> indices{m_indices[0]};
const array<Index, 1> kernel_dims{m_kernelImpl.dimensions()[0]};
internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
switch (kernel_size) {
  LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>),
                    num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP,
                    numX, maxX, 4, data);
  LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>),
                    num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP,
                    numX, maxX, 7, data);
  LAUNCH_GPU_KERNEL(
      (EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>),
      num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX,
const int idxX = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1;
const int idxY = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0;
const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
const int kernel_size_y = m_kernelImpl.dimensions()[idxY];

const int numX = dimensions()[m_indices[idxX]];
const int numY = dimensions()[m_indices[idxY]];
const int numP = dimensions().TotalSize() / (numX * numY);

const float scaling_factor =
    sqrtf(static_cast<float>(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x));

int inner_dim = ((static_cast<int>(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32;
const int maxX = numext::mini<int>(inner_dim, numX);
const int maxY =
    numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY);
const int maxP = numext::mini<int>(
    maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP);

block_size.y = numext::mini<int>(1024 / block_size.x, maxY);
block_size.z = numext::mini<int>(1024 / (block_size.x * block_size.y), maxP);

const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar);
gpu_assert(shared_mem <= maxSharedMem);

const int num_x_blocks = ceil(numX, maxX);
const int num_y_blocks = ceil(numY, maxY);
const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks);

dim3 num_blocks(num_x_blocks, num_y_blocks, numext::mini<int>(num_z_blocks, ceil(numP, block_size.z)));
const array<Index, 2> indices{m_indices[idxX], m_indices[idxY]};
const array<Index, 2> kernel_dims{m_kernelImpl.dimensions()[idxX], m_kernelImpl.dimensions()[idxY]};
internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
switch (kernel_size_x) {
  switch (kernel_size_y) {
    LAUNCH_GPU_KERNEL(
        (EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>),
        num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX,
        numY, maxY, 4, 7, data);
    LAUNCH_GPU_KERNEL(
        (EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>),
        num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX,
        numY, maxY, 4, kernel_size_y, data);
  switch (kernel_size_y) {
    LAUNCH_GPU_KERNEL(
        (EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>),
        num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX,
        numY, maxY, 7, 4, data);
    LAUNCH_GPU_KERNEL(
        (EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>),
        num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX,
        numY, maxY, 7, kernel_size_y, data);
  LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims,
                                              Dynamic, Dynamic>),
                    num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP,
                    numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data);
const int idxX = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2;
const int idxY = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1;
const int idxZ = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0;

const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
const int kernel_size_z = m_kernelImpl.dimensions()[idxZ];

const int numX = dimensions()[m_indices[idxX]];
const int numY = dimensions()[m_indices[idxY]];
const int numZ = dimensions()[m_indices[idxZ]];
const int numP = dimensions().TotalSize() / (numX * numY * numZ);

const int maxX = numext::mini<int>(
    128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1,
const int maxY = numext::mini<int>(
    128, numext::mini<int>(
             maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1,
const int maxZ = numext::mini<int>(
    128, numext::mini<int>(
             maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) -

block_size.z = numext::mini<int>(1024 / (block_size.x * block_size.y), maxZ);
dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ));

const int shared_mem =
    (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar);
gpu_assert(shared_mem <= maxSharedMem);
const array<Index, 3> indices{m_indices[idxX], m_indices[idxY], m_indices[idxZ]};
const array<Index, 3> kernel_dims{m_kernelImpl.dimensions()[idxX], m_kernelImpl.dimensions()[idxY],
                                  m_kernelImpl.dimensions()[idxZ]};
internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
LAUNCH_GPU_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>),
                  num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX,
                  maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data);
THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
return m_buf[index];
template <int LoadMode>
return internal::ploadt<PacketReturnType, LoadMode>(m_buf + index);
const double kernel_size = m_kernelImpl.dimensions().TotalSize();
const double convolve_compute_cost = TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
const double firstIndex_compute_cost =
    (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>());
return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
       kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) +
                      TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize));
TensorEvaluator<InputArgType, GpuDevice> m_inputImpl;
TensorEvaluator<KernelArgType, GpuDevice> m_kernelImpl;
KernelArgType m_kernelArg;
bool m_local_kernel;