TensorConvolutionSycl.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli    Codeplay Software Ltd.
// Ralph Potter  Codeplay Software Ltd.
// Luke Iwanski  Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H
#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H

// IWYU pragma: private
#include "./InternalHeaderCheck.h"
namespace Eigen {

/** \class TensorConvolution
 * \ingroup CXX11_Tensor_Module
 *
 * \brief Tensor convolution class.
 */

enum class convolution_type { CONV1D, CONV2D, CONV3D };

template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
          typename Kernel_accessor, typename Buffer_accessor, convolution_type Conv_Dim>
struct EigenConvolutionKernel;

template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
          typename Kernel_accessor, typename Buffer_accessor>
struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
                              Buffer_accessor, convolution_type::CONV1D> {
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      Local_accessor;
  Local_accessor local_acc;
  Evaluator device_evaluator;
  Kernel_accessor kernel_filter;
  Buffer_accessor buffer_acc;
  internal::IndexMapper<Index, InputDims, 1, Evaluator::Layout> indexMapper;
  const size_t kernelSize;
  const cl::sycl::range<2> input_range;
  EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
                         Buffer_accessor buffer_acc_,
                         internal::IndexMapper<Index, InputDims, 1, Evaluator::Layout> indexMapper_,
                         const size_t kernelSize_, const cl::sycl::range<2> input_range_)
      : local_acc(local_acc_),
        device_evaluator(device_evaluator_),
        kernel_filter(kernel_filter_),
        buffer_acc(buffer_acc_),
        indexMapper(indexMapper_),
        kernelSize(kernelSize_),
        input_range(input_range_) {}

  template <typename BooleanDim2>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim2 boolean_check) const {
    return (boolean_check[0] && boolean_check[1]);
  }
  void operator()(cl::sycl::nd_item<2> itemID) const {
    auto buffer_ptr = buffer_acc;
    auto kernel_ptr = kernel_filter;
    // number of input values each plane must stage in shared (local) memory
    const size_t num_input = (itemID.get_local_range()[0] + kernelSize - 1);
    const size_t plane_kernel_offset = itemID.get_local_id(1) * num_input;
    const size_t input_offset = itemID.get_group(0) * itemID.get_local_range()[0];
    const size_t plane_tensor_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(1));
    // fill the shared memory
    for (size_t i = itemID.get_local_id(0); i < num_input; i += itemID.get_local_range()[0]) {
      const size_t local_index = i + plane_kernel_offset;
      const size_t tensor_index =
          plane_tensor_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + input_offset);

      local_acc[local_index] =
          (((i + input_offset) < (input_range[0] + kernelSize - 1)) && itemID.get_global_id(1) < input_range[1])
              ? device_evaluator.coeff(tensor_index)
              : CoeffReturnType(0);
    }

    itemID.barrier(cl::sycl::access::fence_space::local_space);

    // calculate the convolution
    const size_t first_output_start = itemID.get_group(0) * (itemID.get_local_range()[0]);  // output start x
    if (boundary_check(itemID.get_global_id() < input_range)) {
      CoeffReturnType result = static_cast<CoeffReturnType>(0);
      const size_t index = plane_kernel_offset + itemID.get_local_id(0);
      for (size_t k = 0; k < kernelSize; ++k) {
        result += (local_acc[k + index] * kernel_ptr[k]);
      }
      const size_t tensor_index =
          indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(1)) +
          indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + first_output_start);
      buffer_ptr[tensor_index] = result;
    }
  }
};
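
// Editorial note, not part of the upstream header: a worked example of the 1D
// tiling above. With a work-group local range of 64 along the convolved
// dimension and kernelSize == 5, each plane stages 64 + 5 - 1 == 68 contiguous
// inputs in local memory (slots past the valid input range are zero-filled),
// and after the barrier each in-range work-item accumulates
//   result = sum over k in [0, kernelSize) of local_acc[index + k] * kernel_ptr[k]
// so adjacent work-items reuse four of each other's staged loads.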

template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
          typename Kernel_accessor, typename Buffer_accessor>
struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
                              Buffer_accessor, convolution_type::CONV2D> {
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      Local_accessor;
  Local_accessor local_acc;
  Evaluator device_evaluator;
  Kernel_accessor kernel_filter;
  Buffer_accessor buffer_acc;
  internal::IndexMapper<Index, InputDims, 2, Evaluator::Layout> indexMapper;
  const cl::sycl::range<2> kernel_size;
  const cl::sycl::range<3> input_range;
  EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
                         Buffer_accessor buffer_acc_,
                         internal::IndexMapper<Index, InputDims, 2, Evaluator::Layout> indexMapper_,
                         const cl::sycl::range<2> kernel_size_, const cl::sycl::range<3> input_range_)
      : local_acc(local_acc_),
        device_evaluator(device_evaluator_),
        kernel_filter(kernel_filter_),
        buffer_acc(buffer_acc_),
        indexMapper(indexMapper_),
        kernel_size(kernel_size_),
        input_range(input_range_) {}
  template <typename BooleanDim3>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) const {
    return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
  }

  void operator()(cl::sycl::nd_item<3> itemID) const {
    auto buffer_ptr = buffer_acc;
    auto kernel_ptr = kernel_filter;
    // number of input values each plane must stage in shared (local) memory
    const auto num_input = cl::sycl::range<2>{
        (cl::sycl::range<2>(itemID.get_local_range()[0], itemID.get_local_range()[1]) + kernel_size - 1)};

    const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(2));
    const size_t plane_kernel_offset = itemID.get_local_id(2) * num_input[1];

    const auto input_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0],
                                                 itemID.get_group(1) * itemID.get_local_range()[1]};

    // fill the local memory
    bool in_range_dim2 = itemID.get_global_id(2) < input_range[2];
    for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) {
      const size_t local_input_offset = num_input[0] * (j + plane_kernel_offset);
      bool in_range_dim1 = ((j + input_offset[1]) < (input_range[1] + kernel_size[1] - 1));
      for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) {
        const size_t local_index = i + local_input_offset;
        const size_t tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(
                                                             i + input_offset[0], j + input_offset[1]);
        local_acc[local_index] =
            (((i + input_offset[0]) < (input_range[0] + kernel_size[0] - 1)) && in_range_dim1 && in_range_dim2)
                ? device_evaluator.coeff(tensor_index)
                : CoeffReturnType(0);
      }
    }

    itemID.barrier(cl::sycl::access::fence_space::local_space);

    // output offset start for each thread
    const auto output_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0],
                                                  itemID.get_group(1) * itemID.get_local_range()[1]};

    if (boundary_check(itemID.get_global_id() < input_range)) {
      CoeffReturnType result = static_cast<CoeffReturnType>(0);

      for (size_t j = 0; j < kernel_size[1]; j++) {
        size_t kernel_offset = kernel_size[0] * j;
        const size_t index =
            (num_input[0] * (plane_kernel_offset + j + itemID.get_local_id(1))) + itemID.get_local_id(0);
        for (size_t i = 0; i < kernel_size[0]; i++) {
          result += (local_acc[i + index] * kernel_ptr[i + kernel_offset]);
        }
      }
      const size_t tensor_index =
          indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(2)) +
          indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + output_offset[0],
                                                             itemID.get_local_id(1) + output_offset[1]);

      buffer_ptr[tensor_index] = result;
    }
  }
};
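
// Editorial note, not part of the upstream header: in the 2D kernel each plane
// stages a tile of (local_range[0] + kernel_size[0] - 1) x
// (local_range[1] + kernel_size[1] - 1) inputs, stored row-major in local_acc
// at local_acc[i + num_input[0] * (j + plane_kernel_offset)]; this matches the
// local_memory_size requested by executeEval() further down in this file.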

template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
          typename Kernel_accessor, typename Buffer_accessor>
struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
                              Buffer_accessor, convolution_type::CONV3D> {
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      Local_accessor;
  Local_accessor local_acc;
  Evaluator device_evaluator;
  Kernel_accessor kernel_filter;
  Buffer_accessor buffer_acc;
  internal::IndexMapper<Index, InputDims, 3, Evaluator::Layout> indexMapper;
  const cl::sycl::range<3> kernel_size;
  const cl::sycl::range<3> input_range;
  const size_t numP;

  EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
                         Buffer_accessor buffer_acc_,
                         internal::IndexMapper<Index, InputDims, 3, Evaluator::Layout> indexMapper_,
                         const cl::sycl::range<3> kernel_size_, const cl::sycl::range<3> input_range_,
                         const size_t numP_)
      : local_acc(local_acc_),
        device_evaluator(device_evaluator_),
        kernel_filter(kernel_filter_),
        buffer_acc(buffer_acc_),
        indexMapper(indexMapper_),
        kernel_size(kernel_size_),
        input_range(input_range_),
        numP(numP_) {}
  template <typename BooleanDim3>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) const {
    return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
  }
  void operator()(cl::sycl::nd_item<3> itemID) const {
    auto buffer_ptr = buffer_acc;
    auto kernel_ptr = kernel_filter;
    const auto num_input = cl::sycl::range<3>{itemID.get_local_range() + kernel_size - 1};

    const auto input_offset = cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range()};

    const auto output_offset =
        cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range() + itemID.get_local_id()};

    for (size_t p = 0; p < numP; p++) {
      // fill the shared memory
      const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
      for (size_t k = itemID.get_local_id(2); k < num_input[2]; k += itemID.get_local_range()[2]) {
        size_t local_index_dim2 = num_input[0] * num_input[1] * k;
        bool cond_k_dim = (k + input_offset[2] < (input_range[2] + kernel_size[2] - 1));
        for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) {
          bool cond_j_dim = cond_k_dim && (j + input_offset[1] < (input_range[1] + kernel_size[1] - 1));
          size_t local_index_dim1 = (num_input[0] * j) + local_index_dim2;
          for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) {
            bool conds = cond_j_dim && (i + input_offset[0] < (input_range[0] + kernel_size[0] - 1));
            const size_t local_index = local_index_dim1 + i;
            const size_t tensor_index =
                plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(
                                         i + input_offset[0], j + input_offset[1], k + input_offset[2]);
            local_acc[local_index] = conds ? device_evaluator.coeff(tensor_index) : CoeffReturnType(0);
          }
        }
      }
      itemID.barrier(cl::sycl::access::fence_space::local_space);

      // calculate the convolution
      if (boundary_check(itemID.get_global_id() < input_range)) {
        CoeffReturnType result = static_cast<CoeffReturnType>(0);
        for (size_t k = 0; k < kernel_size[2]; k++) {
          for (size_t j = 0; j < kernel_size[1]; j++) {
            for (size_t i = 0; i < kernel_size[0]; i++) {
              const size_t kernel_index = i + kernel_size[0] * (j + kernel_size[1] * k);
              const size_t local_index =
                  ((i + itemID.get_local_id(0)) +
                   num_input[0] * ((j + itemID.get_local_id(1)) + num_input[1] * (k + itemID.get_local_id(2))));

              result += (local_acc[local_index] * kernel_ptr[kernel_index]);
            }
          }
        }
        const size_t tensor_index =
            indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p) +
            indexMapper.mapGpuOutputKernelToTensorOutputOffset(output_offset[0], output_offset[1], output_offset[2]);
        buffer_ptr[tensor_index] = result;
      }

      itemID.barrier(cl::sycl::access::fence_space::local_space);
    }
  }
};
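
// Editorial note, not part of the upstream header: unlike the 1D and 2D
// kernels, which map the plane dimension onto the nd_range, the 3D kernel
// loops over the planes (p) sequentially inside operator(), reusing one local
// tile per plane; the barrier at the end of the p-loop keeps work-items from
// refilling the tile while neighbours are still convolving the current plane.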

template <typename Indices, typename InputArgType, typename KernelArgType>
struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Eigen::SyclDevice> {
  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;

  static constexpr int NumDims =
      internal::array_size<typename TensorEvaluator<InputArgType, Eigen::SyclDevice>::Dimensions>::value;
  static constexpr int NumKernelDims = internal::array_size<Indices>::value;
  typedef typename XprType::Index Index;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef typename TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Dimensions KernelDimensions;
  typedef const Eigen::SyclDevice Device;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Eigen::SyclDevice>::type PacketReturnType;
  typedef typename InputArgType::Scalar Scalar;
  static constexpr int PacketSize = PacketType<CoeffReturnType, Eigen::SyclDevice>::size;
  typedef StorageMemory<CoeffReturnType, Eigen::SyclDevice> Storage;
  typedef typename Storage::Type EvaluatorPointerType;
  typedef StorageMemory<const CoeffReturnType, Eigen::SyclDevice> KernelStorage;

  static constexpr int Layout = TensorEvaluator<InputArgType, Eigen::SyclDevice>::Layout;
  enum {
    IsAligned = TensorEvaluator<InputArgType, Eigen::SyclDevice>::IsAligned,
    PacketAccess = false,
    BlockAccess = false,
    PreferBlockAccess = false,
    CoordAccess = false,  // to be implemented
    RawAccess = false
  };

  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
  typedef internal::TensorBlockNotImplemented TensorBlock;
  //===--------------------------------------------------------------------===//

  TensorEvaluator(const XprType &op, const Eigen::SyclDevice &device)
      : m_inputImpl(op.inputExpression(), device),
        m_kernelArg(op.kernelExpression()),
        m_kernelImpl(op.kernelExpression(), device),
        m_indices(op.indices()),
        m_buf(NULL),
        m_kernel(NULL),
        m_local_kernel(false),
        m_device(device) {
    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Eigen::SyclDevice>::Layout) ==
                         static_cast<int>(TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Layout)),
                        YOU_MADE_A_PROGRAMMING_MISTAKE);

    const typename TensorEvaluator<InputArgType, Eigen::SyclDevice>::Dimensions &input_dims = m_inputImpl.dimensions();
    const typename TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Dimensions &kernel_dims =
        m_kernelImpl.dimensions();

    m_dimensions = m_inputImpl.dimensions();
    for (int i = 0; i < NumKernelDims; ++i) {
      const Index index = op.indices()[i];
      const Index input_dim = input_dims[index];
      const Index kernel_dim = kernel_dims[i];
      const Index result_dim = input_dim - kernel_dim + 1;
      m_dimensions[index] = result_dim;
    }
  }
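
  // Editorial note, not part of the upstream header: the loop above applies
  // "valid" convolution sizing, result_dim = input_dim - kernel_dim + 1 for
  // every convolved dimension; e.g. a 64x64x8 input convolved over dims {0, 1}
  // with a 3x3 kernel yields a 62x62x8 output.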

  EIGEN_DEVICE_FUNC const Dimensions &dimensions() const { return m_dimensions; }

  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
    preloadKernel();
    m_inputImpl.evalSubExprsIfNeeded(NULL);
    if (data) {
      executeEval(data);
      return false;
    } else {
      m_buf = (EvaluatorPointerType)m_device.get(
          (Scalar *)m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar)));
      executeEval(m_buf);
      return true;
    }
  }

  EIGEN_STRONG_INLINE void cleanup() {
    m_inputImpl.cleanup();
    if (m_buf) {
      m_device.deallocate_temp(m_buf);
      m_buf = NULL;
    }
    if (m_local_kernel) {
      m_device.deallocate_temp(m_kernel);
      m_local_kernel = false;
    }
    m_kernel = NULL;
  }

  /// used by sycl in order to build the sycl buffer
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device &device() const { return m_device; }
  /// used by sycl in order to build the sycl buffer
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_buf; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
    // Don't make a local copy of the kernel unless we have to (i.e. it's an
    // expression that needs to be evaluated)
    typename KernelStorage::Type in_place = m_kernelImpl.data();
    if (in_place) {
      m_kernel = in_place;
      m_local_kernel = false;
    } else {
      ptrdiff_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
      EvaluatorPointerType local = (EvaluatorPointerType)m_device.get((Scalar *)m_device.allocate_temp(kernel_sz));
      typedef TensorEvalToOp<const KernelArgType> EvalTo;
      EvalTo evalToTmp(m_device.get(local), m_kernelArg);
      const bool PacketAccess = internal::IsVectorizable<Eigen::SyclDevice, KernelArgType>::value;
      internal::TensorExecutor<const EvalTo, Eigen::SyclDevice, PacketAccess>::run(evalToTmp, m_device);
      m_kernel = local;
      m_local_kernel = true;
    }
  }
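
  // Editorial note, not part of the upstream header: preloadKernel() copies
  // nothing when the kernel argument already owns device data
  // (m_kernelImpl.data() != NULL); only an unevaluated kernel expression is
  // materialized, once, into a temporary device buffer through TensorEvalToOp,
  // and cleanup() later releases that buffer via the m_local_kernel flag.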

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void executeEval(EvaluatorPointerType data) const {
    typedef TensorEvaluator<InputArgType, Eigen::SyclDevice> InputEvaluator;
    typedef typename InputEvaluator::Dimensions InputDims;
    switch (NumKernelDims) {
      case 1: {
        const size_t numX = dimensions()[m_indices[0]];
        const size_t numP = dimensions().TotalSize() / numX;
        const auto input_dim = std::array<size_t, 2>{numX, numP};
        auto global_range = cl::sycl::range<2>{1, 1};
        auto local_range = cl::sycl::range<2>{1, 1};
        const size_t kernel_size = m_kernelImpl.dimensions().TotalSize();

        m_device.parallel_for_setup(input_dim, global_range, local_range);
        const size_t local_memory_size = (local_range[0] + kernel_size - 1) * (local_range[1]);
        gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
        const array<Index, 1> indices{{m_indices[0]}};
        const array<Index, 1> kernel_dims{{m_kernelImpl.dimensions()[0]}};
        internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);

        typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
                                       typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV1D>
            ConvKernel;

        m_device
            .template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
                m_inputImpl, m_kernel, data, cl::sycl::nd_range<2>(global_range, local_range), local_memory_size,
                indexMapper, kernel_size, cl::sycl::range<2>(input_dim[0], input_dim[1]))
            .wait();
        break;
      }

      case 2: {
        auto kernel_index = std::array<size_t, 2>{static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1,
                                                  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0};
        auto kernel_size = cl::sycl::range<2>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]],
                                              (size_t)m_kernelImpl.dimensions()[kernel_index[1]]};
        const size_t numX = dimensions()[m_indices[kernel_index[0]]];
        const size_t numY = dimensions()[m_indices[kernel_index[1]]];
        const size_t numP = dimensions().TotalSize() / (numX * numY);
        auto input_dim = std::array<size_t, 3>{numX, numY, numP};

        auto global_range = cl::sycl::range<3>{1, 1, 1};
        auto local_range = cl::sycl::range<3>{1, 1, 1};

        m_device.parallel_for_setup(input_dim, global_range, local_range);

        const size_t local_memory_size =
            (local_range[0] + kernel_size[0] - 1) * (local_range[1] + kernel_size[1] - 1) * local_range[2];
        gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
        const array<Index, 2> indices{{m_indices[kernel_index[0]], m_indices[kernel_index[1]]}};
        const array<Index, 2> kernel_dims{
            {m_kernelImpl.dimensions()[kernel_index[0]], m_kernelImpl.dimensions()[kernel_index[1]]}};
        internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
        typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
                                       typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV2D>
            ConvKernel;
        m_device
            .template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
                m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size,
                indexMapper, kernel_size, cl::sycl::range<3>{input_dim[0], input_dim[1], input_dim[2]})
            .wait();
        break;
      }

      case 3: {
        auto kernel_index = std::array<size_t, 3>{static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2,
                                                  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1,
                                                  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0};

        auto kernel_size = cl::sycl::range<3>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]],
                                              (size_t)m_kernelImpl.dimensions()[kernel_index[1]],
                                              (size_t)m_kernelImpl.dimensions()[kernel_index[2]]};

        const size_t numX = dimensions()[m_indices[kernel_index[0]]];
        const size_t numY = dimensions()[m_indices[kernel_index[1]]];
        const size_t numZ = dimensions()[m_indices[kernel_index[2]]];
        auto input_dim = std::array<size_t, 3>{numX, numY, numZ};
        const size_t numP = dimensions().TotalSize() / (numX * numY * numZ);

        const array<Index, 3> indices{
            {m_indices[kernel_index[0]], m_indices[kernel_index[1]], m_indices[kernel_index[2]]}};
        const array<Index, 3> kernel_dims{{m_kernelImpl.dimensions()[kernel_index[0]],
                                           m_kernelImpl.dimensions()[kernel_index[1]],
                                           m_kernelImpl.dimensions()[kernel_index[2]]}};

        internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);

        auto global_range = cl::sycl::range<3>{1, 1, 1};
        auto local_range = cl::sycl::range<3>{1, 1, 1};

        m_device.parallel_for_setup(input_dim, global_range, local_range);
        auto local_memory_range = (local_range + kernel_size - 1);
        const size_t local_memory_size = local_memory_range[0] * local_memory_range[1] * local_memory_range[2];

        gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
        typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
                                       typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV3D>
            ConvKernel;
        m_device
            .template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
                m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size,
                indexMapper, kernel_size, cl::sycl::range<3>(input_dim[0], input_dim[1], input_dim[2]), numP)
            .wait();
        break;
      }

      default: {
        EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3),
                            THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
      }
    }
  }
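
  // Editorial usage sketch, not part of the upstream header; the device setup
  // below is illustrative and assumes a SYCL-enabled build of Eigen. This
  // evaluator is reached through the regular Tensor API:
  //
  // \code
  //   Eigen::QueueInterface queue_interface{cl::sycl::default_selector()};
  //   Eigen::SyclDevice sycl_device(&queue_interface);
  //
  //   Eigen::Tensor<float, 3> input(64, 64, 8), result(62, 62, 8);
  //   Eigen::Tensor<float, 2> kernel(3, 3);
  //   input.setRandom();
  //   kernel.setRandom();
  //
  //   float *d_in = static_cast<float *>(sycl_device.allocate(input.size() * sizeof(float)));
  //   float *d_k = static_cast<float *>(sycl_device.allocate(kernel.size() * sizeof(float)));
  //   float *d_out = static_cast<float *>(sycl_device.allocate(result.size() * sizeof(float)));
  //   Eigen::TensorMap<Eigen::Tensor<float, 3>> gpu_in(d_in, 64, 64, 8);
  //   Eigen::TensorMap<Eigen::Tensor<float, 2>> gpu_k(d_k, 3, 3);
  //   Eigen::TensorMap<Eigen::Tensor<float, 3>> gpu_out(d_out, 62, 62, 8);
  //
  //   sycl_device.memcpyHostToDevice(d_in, input.data(), input.size() * sizeof(float));
  //   sycl_device.memcpyHostToDevice(d_k, kernel.data(), kernel.size() * sizeof(float));
  //   Eigen::array<Eigen::Index, 2> conv_dims({0, 1});
  //   gpu_out.device(sycl_device) = gpu_in.convolve(gpu_k, conv_dims);  // NumKernelDims == 2 path
  //   sycl_device.memcpyDeviceToHost(result.data(), d_out, result.size() * sizeof(float));
  // \endcode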

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
    eigen_assert(m_buf != NULL);
    eigen_assert(index < m_dimensions.TotalSize());
    return m_buf[index];
  }

  template <int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const {
    eigen_assert(m_buf != NULL);
    eigen_assert(index < m_dimensions.TotalSize());
    return internal::ploadt<PacketReturnType, LoadMode>(m_buf + index);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost
    // model.
    const double kernel_size = m_kernelImpl.dimensions().TotalSize();
    // We ignore the use of fused multiply-add.
    const double convolve_compute_cost = TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
    const double firstIndex_compute_cost =
        NumDims *
        (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>());
    return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
           kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) +
                          TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize));
  }
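
  // Editorial note, not part of the upstream header: the estimate above is
  //   firstIndex_compute_cost + kernel_size * (input_cost + kernel_cost + convolve_compute_cost)
  // i.e. each output coefficient is charged kernel_size multiply-adds plus the
  // per-coefficient cost of evaluating both operands, mirroring the CPU model.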

 private:
  // No assignment (copies are needed by the kernels)
  TensorEvaluator &operator=(const TensorEvaluator &);
  TensorEvaluator<InputArgType, Eigen::SyclDevice> m_inputImpl;
  KernelArgType m_kernelArg;
  TensorEvaluator<KernelArgType, Eigen::SyclDevice> m_kernelImpl;
  Indices m_indices;
  Dimensions m_dimensions;
  EvaluatorPointerType m_buf;
  typename KernelStorage::Type m_kernel;
  bool m_local_kernel;
  const Eigen::SyclDevice EIGEN_DEVICE_REF m_device;
};

}  // end namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H