15 #ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H
16 #define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H
32 template <
typename Evaluator,
typename CoeffReturnType,
typename KernelType,
typename Index,
typename InputDims,
33 typename Kernel_accessor,
typename Buffer_accessor,
convolution_type Conv_Dim>
35 template <
typename Evaluator,
typename CoeffReturnType,
typename KernelType,
typename Index,
typename InputDims,
36 typename Kernel_accessor,
typename Buffer_accessor>
39 typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
49 Buffer_accessor buffer_acc_,
51 const size_t kernelSize_,
const cl::sycl::range<2> input_range_)
52 : local_acc(local_acc_),
53 device_evaluator(device_evaluator_),
54 kernel_filter(kernel_filter_),
55 buffer_acc(buffer_acc_),
56 indexMapper(indexMapper_),
57 kernelSize(kernelSize_),
58 input_range(input_range_) {}
60 template <
typename BooleanDim2>
62 return (boolean_check[0] && boolean_check[1]);
65 auto buffer_ptr = buffer_acc;
66 auto kernel_ptr = kernel_filter;
68 const size_t num_input = (itemID.get_local_range()[0] + kernelSize - 1);
69 const size_t plane_kernel_offset = itemID.get_local_id(1) * num_input;
70 const size_t input_offset = itemID.get_group(0) * itemID.get_local_range()[0];
73 for (
size_t i = itemID.get_local_id(0);
i < num_input;
i += itemID.get_local_range()[0]) {
74 const size_t local_index =
i + plane_kernel_offset;
75 const size_t tensor_index =
78 local_acc[local_index] =
79 (((
i + input_offset) < (input_range[0] + kernelSize - 1)) && itemID.get_global_id(1) < input_range[1])
80 ? device_evaluator.coeff(tensor_index)
84 itemID.barrier(cl::sycl::access::fence_space::local_space);
87 const size_t first_output_start = itemID.get_group(0) * (itemID.get_local_range()[0]);
88 if (boundary_check(itemID.get_global_id() < input_range)) {
89 CoeffReturnType result =
static_cast<CoeffReturnType
>(0);
90 const size_t index = plane_kernel_offset + itemID.get_local_id(0);
91 for (
size_t k = 0;
k < kernelSize; ++
k) {
92 result += (local_acc[
k + index] * kernel_ptr[
k]);
94 const size_t tensor_index =
97 buffer_ptr[tensor_index] = result;
102 template <
typename Evaluator,
typename CoeffReturnType,
typename KernelType,
typename Index,
typename InputDims,
103 typename Kernel_accessor,
typename Buffer_accessor>
106 typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
116 Buffer_accessor buffer_acc_,
118 const cl::sycl::range<2> kernel_size_,
const cl::sycl::range<3> input_range_)
119 : local_acc(local_acc_),
120 device_evaluator(device_evaluator_),
121 kernel_filter(kernel_filter_),
122 buffer_acc(buffer_acc_),
123 indexMapper(indexMapper_),
124 kernel_size(kernel_size_),
125 input_range(input_range_) {}
126 template <
typename BooleanDim3>
128 return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
132 auto buffer_ptr = buffer_acc;
133 auto kernel_ptr = kernel_filter;
135 const auto num_input = cl::sycl::range<2>{
136 (cl::sycl::range<2>(itemID.get_local_range()[0], itemID.get_local_range()[1]) + kernel_size - 1)};
139 const size_t plane_kernel_offset = itemID.get_local_id(2) * num_input[1];
141 const auto input_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0],
142 itemID.get_group(1) * itemID.get_local_range()[1]};
145 bool in_range_dim2 = itemID.get_global_id(2) < input_range[2];
146 for (
size_t j = itemID.get_local_id(1);
j < num_input[1];
j += itemID.get_local_range()[1]) {
147 const size_t local_input_offset = num_input[0] * (
j + plane_kernel_offset);
148 bool in_range_dim1 = ((
j + input_offset[1]) < (input_range[1] + kernel_size[1] - 1));
149 for (
size_t i = itemID.get_local_id(0);
i < num_input[0];
i += itemID.get_local_range()[0]) {
150 const size_t local_index =
i + local_input_offset;
152 i + input_offset[0],
j + input_offset[1]);
153 local_acc[local_index] =
154 (((
i + input_offset[0]) < (input_range[0] + kernel_size[0] - 1)) && in_range_dim1 && in_range_dim2)
155 ? device_evaluator.coeff(tensor_index)
156 : CoeffReturnType(0);
160 itemID.barrier(cl::sycl::access::fence_space::local_space);
163 const auto output_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0],
164 itemID.get_group(1) * itemID.get_local_range()[1]};
166 if (boundary_check(itemID.get_global_id() < input_range)) {
167 CoeffReturnType result =
static_cast<CoeffReturnType
>(0);
169 for (
size_t j = 0;
j < kernel_size[1];
j++) {
170 size_t kernel_offset = kernel_size[0] *
j;
172 (num_input[0] * (plane_kernel_offset +
j + itemID.get_local_id(1))) + itemID.get_local_id(0);
173 for (
size_t i = 0;
i < kernel_size[0];
i++) {
174 result += (local_acc[
i + index] * kernel_ptr[
i + kernel_offset]);
177 const size_t tensor_index =
180 itemID.get_local_id(1) + output_offset[1]);
182 buffer_ptr[tensor_index] = result;
187 template <
typename Evaluator,
typename CoeffReturnType,
typename KernelType,
typename Index,
typename InputDims,
188 typename Kernel_accessor,
typename Buffer_accessor>
191 typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
203 Buffer_accessor buffer_acc_,
205 const cl::sycl::range<3> kernel_size_,
const cl::sycl::range<3> input_range_,
207 : local_acc(local_acc_),
208 device_evaluator(device_evaluator_),
209 kernel_filter(kernel_filter_),
210 buffer_acc(buffer_acc_),
211 indexMapper(indexMapper_),
212 kernel_size(kernel_size_),
213 input_range(input_range_),
215 template <
typename BooleanDim3>
217 return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
220 auto buffer_ptr = buffer_acc;
221 auto kernel_ptr = kernel_filter;
222 const auto num_input = cl::sycl::range<3>{itemID.get_local_range() + kernel_size - 1};
224 const auto input_offset = cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range()};
226 const auto output_offset =
227 cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range() + itemID.get_local_id()};
229 for (
size_t p = 0;
p < numP;
p++) {
232 for (
size_t k = itemID.get_local_id(2);
k < num_input[2];
k += itemID.get_local_range()[2]) {
233 size_t local_index_dim2 = num_input[0] * num_input[1] *
k;
234 bool cond_k_dim = (
k + input_offset[2] < (input_range[2] + kernel_size[2] - 1));
235 for (
size_t j = itemID.get_local_id(1);
j < num_input[1];
j += itemID.get_local_range()[1]) {
236 bool cond_j_dim = cond_k_dim && (
j + input_offset[1] < (input_range[1] + kernel_size[1] - 1));
237 size_t local_index_dim1 = (num_input[0] *
j) + local_index_dim2;
238 for (
size_t i = itemID.get_local_id(0);
i < num_input[0];
i += itemID.get_local_range()[0]) {
239 bool conds = cond_j_dim && (
i + input_offset[0] < (input_range[0] + kernel_size[0] - 1));
240 const size_t local_index = local_index_dim1 +
i;
241 const size_t tensor_index =
243 i + input_offset[0],
j + input_offset[1],
k + input_offset[2]);
244 local_acc[local_index] = conds ? device_evaluator.coeff(tensor_index) : CoeffReturnType(0);
248 itemID.barrier(cl::sycl::access::fence_space::local_space);
252 if (boundary_check(itemID.get_global_id() < input_range)) {
253 CoeffReturnType result =
static_cast<CoeffReturnType
>(0);
254 for (
size_t k = 0;
k < kernel_size[2];
k++) {
255 for (
size_t j = 0;
j < kernel_size[1];
j++) {
256 for (
size_t i = 0;
i < kernel_size[0];
i++) {
257 const size_t kernel_index =
i + kernel_size[0] * (
j + kernel_size[1] *
k);
258 const size_t local_index =
259 ((
i + itemID.get_local_id(0)) +
260 num_input[0] * ((
j + itemID.get_local_id(1)) + num_input[1] * (
k + itemID.get_local_id(2))));
262 result += (local_acc[local_index] * kernel_ptr[kernel_index]);
266 const size_t tensor_index =
269 buffer_ptr[tensor_index] = result;
272 itemID.barrier(cl::sycl::access::fence_space::local_space);
277 template <
typename Indices,
typename InputArgType,
typename KernelArgType>
281 static constexpr
int NumDims =
302 PreferBlockAccess =
false,
312 : m_inputImpl(
op.inputExpression(), device),
313 m_kernelArg(
op.kernelExpression()),
314 m_kernelImpl(
op.kernelExpression(), device),
315 m_indices(
op.indices()),
318 m_local_kernel(false),
322 YOU_MADE_A_PROGRAMMING_MISTAKE);
328 m_dimensions = m_inputImpl.dimensions();
329 for (
int i = 0;
i < NumKernelDims; ++
i) {
330 const Index index =
op.indices()[
i];
331 const Index input_dim = input_dims[index];
332 const Index kernel_dim = kernel_dims[
i];
333 const Index result_dim = input_dim - kernel_dim + 1;
334 m_dimensions[index] = result_dim;
342 m_inputImpl.evalSubExprsIfNeeded(NULL);
355 m_inputImpl.cleanup();
360 if (m_local_kernel) {
362 m_local_kernel =
false;
377 m_local_kernel =
false;
379 ptrdiff_t kernel_sz = m_kernelImpl.dimensions().TotalSize() *
sizeof(
Scalar);
382 EvalTo evalToTmp(
m_device.get(local), m_kernelArg);
386 m_local_kernel =
true;
392 typedef typename InputEvaluator::Dimensions InputDims;
393 switch (NumKernelDims) {
395 const size_t numX =
dimensions()[m_indices[0]];
396 const size_t numP =
dimensions().TotalSize() / numX;
397 const auto input_dim = std::array<size_t, 2>{numX, numP};
398 auto global_range = cl::sycl::range<2>{1, 1};
399 auto local_range = cl::sycl::range<2>{1, 1};
400 const size_t kernel_size = m_kernelImpl.dimensions().TotalSize();
402 m_device.parallel_for_setup(input_dim, global_range, local_range);
403 const size_t local_memory_size = (local_range[0] + kernel_size - 1) * (local_range[1]);
404 gpu_assert(
static_cast<unsigned long>(local_memory_size) <=
m_device.sharedMemPerBlock());
414 .template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
415 m_inputImpl, m_kernel,
data, cl::sycl::nd_range<2>(global_range, local_range), local_memory_size,
416 indexMapper, kernel_size, cl::sycl::range<2>(input_dim[0], input_dim[1]))
422 auto kernel_index = std::array<size_t, 2>{
static_cast<int>(
Layout) ==
static_cast<int>(
ColMajor) ? 0 : 1,
424 auto kernel_size = cl::sycl::range<2>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]],
425 (size_t)m_kernelImpl.dimensions()[kernel_index[1]]};
426 const size_t numX =
dimensions()[m_indices[kernel_index[0]]];
427 const size_t numY =
dimensions()[m_indices[kernel_index[1]]];
428 const size_t numP =
dimensions().TotalSize() / (numX * numY);
429 auto input_dim = std::array<size_t, 3>{numX, numY, numP};
431 auto global_range = cl::sycl::range<3>{1, 1, 1};
432 auto local_range = cl::sycl::range<3>{1, 1, 1};
434 m_device.parallel_for_setup(input_dim, global_range, local_range);
436 const size_t local_memory_size =
437 (local_range[0] + kernel_size[0] - 1) * (local_range[1] + kernel_size[1] - 1) * local_range[2];
438 gpu_assert(
static_cast<unsigned long>(local_memory_size) <=
m_device.sharedMemPerBlock());
439 const array<Index, 2> indices{{m_indices[kernel_index[0]], m_indices[kernel_index[1]]}};
441 {m_kernelImpl.dimensions()[kernel_index[0]], m_kernelImpl.dimensions()[kernel_index[1]]}};
447 .template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
448 m_inputImpl, m_kernel,
data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size,
449 indexMapper, kernel_size, cl::sycl::range<3>{input_dim[0], input_dim[1], input_dim[2]})
455 auto kernel_index = std::array<size_t, 3>{
static_cast<int>(
Layout) ==
static_cast<int>(
ColMajor) ? 0 : 2,
459 auto kernel_size = cl::sycl::range<3>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]],
460 (size_t)m_kernelImpl.dimensions()[kernel_index[1]],
461 (size_t)m_kernelImpl.dimensions()[kernel_index[2]]};
463 const size_t numX =
dimensions()[m_indices[kernel_index[0]]];
464 const size_t numY =
dimensions()[m_indices[kernel_index[1]]];
465 const size_t numZ =
dimensions()[m_indices[kernel_index[2]]];
466 auto input_dim = std::array<size_t, 3>{numX, numY, numZ};
467 const size_t numP =
dimensions().TotalSize() / (numX * numY * numZ);
470 {m_indices[kernel_index[0]], m_indices[kernel_index[1]], m_indices[kernel_index[2]]}};
471 const array<Index, 3> kernel_dims{{m_kernelImpl.dimensions()[kernel_index[0]],
472 m_kernelImpl.dimensions()[kernel_index[1]],
473 m_kernelImpl.dimensions()[kernel_index[2]]}};
477 auto global_range = cl::sycl::range<3>{1, 1, 1};
478 auto local_range = cl::sycl::range<3>{1, 1, 1};
480 m_device.parallel_for_setup(input_dim, global_range, local_range);
481 auto local_memory_range = (local_range + kernel_size - 1);
482 const size_t local_memory_size = local_memory_range[0] * local_memory_range[1] * local_memory_range[2];
484 gpu_assert(
static_cast<unsigned long>(local_memory_size) <=
m_device.sharedMemPerBlock());
489 .template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
490 m_inputImpl, m_kernel,
data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size,
491 indexMapper, kernel_size, cl::sycl::range<3>(input_dim[0], input_dim[1], input_dim[2]), numP)
498 THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
509 template <
int LoadMode>
513 return internal::ploadt<PacketReturnType, LoadMode>(m_buf + index);
519 const double kernel_size = m_kernelImpl.dimensions().TotalSize();
521 const double convolve_compute_cost = TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
522 const double firstIndex_compute_cost =
524 (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>());
526 kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) +
int i
Definition: BiCGSTAB_step_by_step.cpp:9
#define EIGEN_DEVICE_FUNC
Definition: Macros.h:892
#define eigen_assert(x)
Definition: Macros.h:910
#define EIGEN_STRONG_INLINE
Definition: Macros.h:834
#define EIGEN_STATIC_ASSERT(X, MSG)
Definition: StaticAssert.h:26
#define EIGEN_DEVICE_REF
Definition: TensorMacros.h:34
float * p
Definition: Tutorial_Map_using.cpp:9
SCALAR Scalar
Definition: bench_gemm.cpp:45
Definition: TensorConvolution.h:236
Eigen::internal::traits< TensorConvolutionOp >::Index Index
Definition: TensorConvolution.h:244
internal::promote_storage_type< typename InputXprType::CoeffReturnType, typename KernelXprType::CoeffReturnType >::ret CoeffReturnType
Definition: TensorConvolution.h:241
Definition: TensorEvalTo.h:61
Definition: TensorCostModel.h:28
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputPlaneToTensorOutputOffset(Index p) const
Definition: TensorConvolution.h:136
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i) const
Definition: TensorConvolution.h:162
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputPlaneToTensorInputOffset(Index p) const
Definition: TensorConvolution.h:110
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i) const
Definition: TensorConvolution.h:167
Definition: TensorBlock.h:566
static EIGEN_STRONG_INLINE void run(const Expression &expr, const Device &device=DefaultDevice())
Definition: TensorExecutor.h:92
@ ColMajor
Definition: Constants.h:318
char char char int int * k
Definition: level2_impl.h:374
char char * op
Definition: level2_impl.h:374
Namespace containing all symbols from the Eigen library.
Definition: bench_norm.cpp:70
std::array< T, N > array
Definition: EmulateArray.h:231
squared absolute value
Definition: GlobalFunctions.h:87
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition: Meta.h:83
convolution_type
Definition: TensorConvolutionSycl.h:31
Type
Type of JSON value.
Definition: rapidjson.h:513
cl::sycl::accessor< CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local > Local_accessor
Definition: TensorConvolutionSycl.h:107
Buffer_accessor buffer_acc
Definition: TensorConvolutionSycl.h:111
Evaluator device_evaluator
Definition: TensorConvolutionSycl.h:109
Local_accessor local_acc
Definition: TensorConvolutionSycl.h:108
void operator()(cl::sycl::nd_item< 3 > itemID) const
Definition: TensorConvolutionSycl.h:131
Kernel_accessor kernel_filter
Definition: TensorConvolutionSycl.h:110
const cl::sycl::range< 2 > kernel_size
Definition: TensorConvolutionSycl.h:113
const cl::sycl::range< 3 > input_range
Definition: TensorConvolutionSycl.h:114
internal::IndexMapper< Index, InputDims, 2, Evaluator::Layout > indexMapper
Definition: TensorConvolutionSycl.h:112
EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_, Buffer_accessor buffer_acc_, internal::IndexMapper< Index, InputDims, 2, Evaluator::Layout > indexMapper_, const cl::sycl::range< 2 > kernel_size_, const cl::sycl::range< 3 > input_range_)
Definition: TensorConvolutionSycl.h:115
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) const
Definition: TensorConvolutionSycl.h:127
Evaluator device_evaluator
Definition: TensorConvolutionSycl.h:42
Buffer_accessor buffer_acc
Definition: TensorConvolutionSycl.h:44
Local_accessor local_acc
Definition: TensorConvolutionSycl.h:41
internal::IndexMapper< Index, InputDims, 1, Evaluator::Layout > indexMapper
Definition: TensorConvolutionSycl.h:45
void operator()(cl::sycl::nd_item< 2 > itemID) const
Definition: TensorConvolutionSycl.h:64
const cl::sycl::range< 2 > input_range
Definition: TensorConvolutionSycl.h:47
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim2 boolean_check) const
Definition: TensorConvolutionSycl.h:61
EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_, Buffer_accessor buffer_acc_, internal::IndexMapper< Index, InputDims, 1, Evaluator::Layout > indexMapper_, const size_t kernelSize_, const cl::sycl::range< 2 > input_range_)
Definition: TensorConvolutionSycl.h:48
const size_t kernelSize
Definition: TensorConvolutionSycl.h:46
Kernel_accessor kernel_filter
Definition: TensorConvolutionSycl.h:43
cl::sycl::accessor< CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local > Local_accessor
Definition: TensorConvolutionSycl.h:40
void operator()(cl::sycl::nd_item< 3 > itemID) const
Definition: TensorConvolutionSycl.h:219
Kernel_accessor kernel_filter
Definition: TensorConvolutionSycl.h:195
cl::sycl::accessor< CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local > Local_accessor
Definition: TensorConvolutionSycl.h:192
internal::IndexMapper< Index, InputDims, 3, Evaluator::Layout > indexMapper
Definition: TensorConvolutionSycl.h:197
const cl::sycl::range< 3 > input_range
Definition: TensorConvolutionSycl.h:199
Evaluator device_evaluator
Definition: TensorConvolutionSycl.h:194
Local_accessor local_acc
Definition: TensorConvolutionSycl.h:193
Buffer_accessor buffer_acc
Definition: TensorConvolutionSycl.h:196
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) const
Definition: TensorConvolutionSycl.h:216
const size_t numP
Definition: TensorConvolutionSycl.h:200
EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_, Buffer_accessor buffer_acc_, internal::IndexMapper< Index, InputDims, 3, Evaluator::Layout > indexMapper_, const cl::sycl::range< 3 > kernel_size_, const cl::sycl::range< 3 > input_range_, const size_t numP_)
Definition: TensorConvolutionSycl.h:202
const cl::sycl::range< 3 > kernel_size
Definition: TensorConvolutionSycl.h:198
Definition: TensorConvolutionSycl.h:34
Definition: TensorMeta.h:47
Definition: TensorForwardDeclarations.h:42
A cost model used to limit the number of threads used for evaluating tensor expression.
Definition: TensorEvaluator.h:31
static constexpr int Layout
Definition: TensorEvaluator.h:46
Derived::Scalar Scalar
Definition: TensorEvaluator.h:33
const Device EIGEN_DEVICE_REF m_device
Definition: TensorEvaluator.h:170
Storage::Type EvaluatorPointerType
Definition: TensorEvaluator.h:41
@ PacketAccess
Definition: TensorEvaluator.h:50
@ IsAligned
Definition: TensorEvaluator.h:49
static constexpr int PacketSize
Definition: TensorEvaluator.h:38
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const
Definition: TensorEvaluator.h:165
Derived::Scalar CoeffReturnType
Definition: TensorEvaluator.h:34
Derived::Index Index
Definition: TensorEvaluator.h:32
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions & dimensions() const
Definition: TensorEvaluator.h:69
Definition: TensorForwardDeclarations.h:175
std::ptrdiff_t j
Definition: tut_arithmetic_redux_minmax.cpp:2