gpu_test_helper.h
Go to the documentation of this file.
1 #ifndef GPU_TEST_HELPER_H
2 #define GPU_TEST_HELPER_H
3 
4 #include <Eigen/Core>
5 
6 // Allow gpu** macros for generic tests.
8 
9 // std::tuple cannot be used on device, and there is a bug in cuda < 9.2 that
10 // doesn't allow std::tuple to compile for host code either. In these cases,
11 // use our custom implementation.
12 #if defined(EIGEN_GPU_COMPILE_PHASE) || (defined(EIGEN_CUDACC) && EIGEN_CUDA_SDK_VER < 92000)
13 #define EIGEN_USE_CUSTOM_TUPLE 1
14 #else
15 #define EIGEN_USE_CUSTOM_TUPLE 0
16 #endif
17 
18 #if EIGEN_USE_CUSTOM_TUPLE
19 #include "../Eigen/src/Core/arch/GPU/Tuple.h"
20 #else
21 #include <tuple>
22 #endif
23 namespace Eigen {
24 
25 namespace internal {
26 
27 // Note: cannot re-use tuple_impl, since that will cause havoc for
28 // tuple_test.
29 namespace test_detail {
30 // Use std::tuple on CPU, otherwise use the GPU-specific versions.
31 #if !EIGEN_USE_CUSTOM_TUPLE
32 using std::get;
33 using std::make_tuple;
34 using std::tie;
35 using std::tuple;
36 #else
37 using tuple_impl::get;
39 using tuple_impl::tie;
40 using tuple_impl::tuple;
41 #endif
42 #undef EIGEN_USE_CUSTOM_TUPLE
43 } // namespace test_detail
44 
45 template <size_t N, size_t Idx, typename OutputIndexSequence, typename... Ts>
47 
59 template <size_t N, size_t Idx, size_t... OutputIndices, typename T1, typename... Ts>
60 struct extract_output_indices_helper<N, Idx, std::index_sequence<OutputIndices...>, T1, Ts...> {
62  N - 1, Idx + 1,
63  typename std::conditional<
64  // If is a non-const l-value reference, append index.
65  std::is_lvalue_reference<T1>::value && !std::is_const<std::remove_reference_t<T1>>::value,
66  std::index_sequence<OutputIndices..., Idx>, std::index_sequence<OutputIndices...>>::type,
67  Ts...>::type;
68 };
69 
70 // Base case.
71 template <size_t Idx, size_t... OutputIndices>
72 struct extract_output_indices_helper<0, Idx, std::index_sequence<OutputIndices...>> {
73  using type = std::index_sequence<OutputIndices...>;
74 };
75 
76 // Extracts a set of indices into Types... that correspond to non-const
77 // l-value references.
78 template <typename... Types>
80  typename extract_output_indices_helper<sizeof...(Types), 0, std::index_sequence<>, Types...>::type;
81 
82 // Helper struct for dealing with Generic functors that may return void.
83 struct void_helper {
84  struct Void {};
85 
86  // Converts void -> Void, T otherwise.
87  template <typename T>
89 
90  // Non-void return value.
91  template <typename Func, typename... Args>
92  static EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC auto call(Func&& func, Args&&... args)
93  -> std::enable_if_t<!std::is_same<decltype(func(args...)), void>::value, decltype(func(args...))> {
94  return func(std::forward<Args>(args)...);
95  }
96 
97  // Void return value.
98  template <typename Func, typename... Args>
99  static EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC auto call(Func&& func, Args&&... args)
100  -> std::enable_if_t<std::is_same<decltype(func(args...)), void>::value, Void> {
101  func(std::forward<Args>(args)...);
102  return Void{};
103  }
104 
105  // Restores the original return type, Void -> void, T otherwise.
106  template <typename T>
109  restore(T&& val) {
110  return val;
111  }
112 
113  // Void case.
114  template <typename T = void>
116 };
117 
118 // Runs a kernel via serialized buffer. Does this by deserializing the buffer
119 // to construct the arguments, calling the kernel, then re-serialing the outputs.
120 // The buffer contains
121 // [ input_buffer_size, args ]
122 // After the kernel call, it is then populated with
123 // [ output_buffer_size, output_parameters, return_value ]
124 // If the output_buffer_size exceeds the buffer's capacity, then only the
125 // output_buffer_size is populated.
126 template <typename Kernel, typename... Args, size_t... Indices, size_t... OutputIndices>
127 EIGEN_DEVICE_FUNC void run_serialized(std::index_sequence<Indices...>, std::index_sequence<OutputIndices...>,
128  Kernel kernel, uint8_t* buffer, size_t capacity) {
129  using test_detail::get;
131  using test_detail::tuple;
132  // Deserialize input size and inputs.
133  size_t input_size;
134  const uint8_t* read_ptr = buffer;
135  const uint8_t* read_end = buffer + capacity;
136  read_ptr = Eigen::deserialize(read_ptr, read_end, input_size);
137  // Create value-type instances to populate.
138  auto args = make_tuple(typename std::decay<Args>::type{}...);
139  EIGEN_UNUSED_VARIABLE(args) // Avoid NVCC compile warning.
140  // NVCC 9.1 requires us to spell out the template parameters explicitly.
141  read_ptr = Eigen::deserialize(read_ptr, read_end, get<Indices, typename std::decay<Args>::type...>(args)...);
142 
143  // Call function, with void->Void conversion so we are guaranteed a complete
144  // output type.
145  auto result = void_helper::call(kernel, get<Indices, typename std::decay<Args>::type...>(args)...);
146 
147  // Determine required output size.
148  size_t output_size = Eigen::serialize_size(capacity);
149  output_size += Eigen::serialize_size(get<OutputIndices, typename std::decay<Args>::type...>(args)...);
150  output_size += Eigen::serialize_size(result);
151 
152  // Always serialize required buffer size.
153  uint8_t* write_ptr = buffer;
154  uint8_t* write_end = buffer + capacity;
155  write_ptr = Eigen::serialize(write_ptr, write_end, output_size);
156  // Null `write_ptr` can be safely passed along.
157  // Serialize outputs if they fit in the buffer.
158  if (output_size <= capacity) {
159  // Collect outputs and result.
160  write_ptr = Eigen::serialize(write_ptr, write_end, get<OutputIndices, typename std::decay<Args>::type...>(args)...);
161  write_ptr = Eigen::serialize(write_ptr, write_end, result);
162  }
163 }
164 
165 template <typename Kernel, typename... Args>
166 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_serialized(Kernel kernel, uint8_t* buffer, size_t capacity) {
167  run_serialized<Kernel, Args...>(std::make_index_sequence<sizeof...(Args)>{}, extract_output_indices<Args...>{},
168  kernel, buffer, capacity);
169 }
170 
171 #ifdef EIGEN_GPUCC
172 
173 // Checks for GPU errors and asserts / prints the error message.
174 #define GPU_CHECK(expr) \
175  do { \
176  gpuError_t err = expr; \
177  if (err != gpuSuccess) { \
178  printf("%s: %s\n", gpuGetErrorName(err), gpuGetErrorString(err)); \
179  gpu_assert(false); \
180  } \
181  } while (0)
182 
183 // Calls run_serialized on the GPU.
184 template <typename Kernel, typename... Args>
185 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void run_serialized_on_gpu_meta_kernel(const Kernel kernel, uint8_t* buffer,
186  size_t capacity) {
187  run_serialized<Kernel, Args...>(kernel, buffer, capacity);
188 }
189 
190 // Runs kernel(args...) on the GPU via the serialization mechanism.
191 //
192 // Note: this may end up calling the kernel multiple times if the initial output
193 // buffer is not large enough to hold the outputs.
194 template <typename Kernel, typename... Args, size_t... Indices, size_t... OutputIndices>
195 auto run_serialized_on_gpu(size_t buffer_capacity_hint, std::index_sequence<Indices...>,
196  std::index_sequence<OutputIndices...>, Kernel kernel, Args&&... args)
197  -> decltype(kernel(args...)) {
198  // Compute the required serialization buffer capacity.
199  // Round up input size to next power of two to give a little extra room
200  // for outputs.
201  size_t input_data_size = sizeof(size_t) + Eigen::serialize_size(args...);
202 
203  size_t capacity;
204  if (buffer_capacity_hint == 0) {
205  // Estimate as the power of two larger than the total input size.
206  capacity = sizeof(size_t);
207  while (capacity <= input_data_size) {
208  capacity *= 2;
209  }
210  } else {
211  // Use the larger of the hint and the total input size.
212  // Add sizeof(size_t) to the hint to account for storing the buffer capacity
213  // itself so the user doesn't need to think about this.
214  capacity = std::max<size_t>(buffer_capacity_hint + sizeof(size_t), input_data_size);
215  }
216  std::vector<uint8_t> buffer(capacity);
217 
218  uint8_t* host_data = nullptr;
219  uint8_t* host_data_end = nullptr;
220  uint8_t* host_ptr = nullptr;
221  uint8_t* device_data = nullptr;
222  size_t output_data_size = 0;
223 
224  // Allocate buffers and copy input data.
225  capacity = std::max<size_t>(capacity, output_data_size);
226  buffer.resize(capacity);
227  host_data = buffer.data();
228  host_data_end = buffer.data() + capacity;
229  host_ptr = Eigen::serialize(host_data, host_data_end, input_data_size);
230  host_ptr = Eigen::serialize(host_ptr, host_data_end, args...);
231 
232  // Copy inputs to host.
233  gpuMalloc((void**)(&device_data), capacity);
234  gpuMemcpy(device_data, buffer.data(), input_data_size, gpuMemcpyHostToDevice);
235  GPU_CHECK(gpuDeviceSynchronize());
236 
237 // Run kernel.
238 #ifdef EIGEN_USE_HIP
239  hipLaunchKernelGGL(HIP_KERNEL_NAME(run_serialized_on_gpu_meta_kernel<Kernel, Args...>), 1, 1, 0, 0, kernel,
240  device_data, capacity);
241 #else
242  run_serialized_on_gpu_meta_kernel<Kernel, Args...><<<1, 1>>>(kernel, device_data, capacity);
243 #endif
244  // Check pre-launch and kernel execution errors.
245  GPU_CHECK(gpuGetLastError());
246  GPU_CHECK(gpuDeviceSynchronize());
247  // Copy back new output to host.
248  gpuMemcpy(host_data, device_data, capacity, gpuMemcpyDeviceToHost);
249  gpuFree(device_data);
250  GPU_CHECK(gpuDeviceSynchronize());
251 
252  // Determine output buffer size.
253  const uint8_t* c_host_ptr = Eigen::deserialize(host_data, host_data_end, output_data_size);
254  // If the output doesn't fit in the buffer, spit out warning and fail.
255  if (output_data_size > capacity) {
256  std::cerr << "The serialized output does not fit in the output buffer, " << output_data_size << " vs capacity "
257  << capacity << "." << std::endl
258  << "Try specifying a minimum buffer capacity: " << std::endl
259  << " run_with_hint(" << output_data_size << ", ...)" << std::endl;
260  VERIFY(false);
261  }
262 
263  // Deserialize outputs.
264  auto args_tuple = test_detail::tie(args...);
265  EIGEN_UNUSED_VARIABLE(args_tuple) // Avoid NVCC compile warning.
266  c_host_ptr = Eigen::deserialize(c_host_ptr, host_data_end, test_detail::get<OutputIndices, Args&...>(args_tuple)...);
267 
268  // Maybe deserialize return value, properly handling void.
269  typename void_helper::ReturnType<decltype(kernel(args...))> result;
270  c_host_ptr = Eigen::deserialize(c_host_ptr, host_data_end, result);
271  return void_helper::restore(result);
272 }
273 
274 #endif // EIGEN_GPUCC
275 
276 } // namespace internal
277 
284 template <typename Kernel, typename... Args>
285 auto run_on_cpu(Kernel kernel, Args&&... args) -> decltype(kernel(args...)) {
286  return kernel(std::forward<Args>(args)...);
287 }
288 
289 #ifdef EIGEN_GPUCC
290 
303 template <typename Kernel, typename... Args>
304 auto run_on_gpu(Kernel kernel, Args&&... args) -> decltype(kernel(args...)) {
305  return internal::run_serialized_on_gpu<Kernel, Args...>(
306  /*buffer_capacity_hint=*/0, std::make_index_sequence<sizeof...(Args)>{},
307  internal::extract_output_indices<Args...>{}, kernel, std::forward<Args>(args)...);
308 }
309 
324 template <typename Kernel, typename... Args>
325 auto run_on_gpu_with_hint(size_t buffer_capacity_hint, Kernel kernel, Args&&... args) -> decltype(kernel(args...)) {
326  return internal::run_serialized_on_gpu<Kernel, Args...>(
327  buffer_capacity_hint, std::make_index_sequence<sizeof...(Args)>{}, internal::extract_output_indices<Args...>{},
328  kernel, std::forward<Args>(args)...);
329 }
330 
335 struct CompileTimeDeviceInfoKernel {
336  struct Info {
337  int cuda;
338  int hip;
339  };
340 
341  EIGEN_DEVICE_FUNC Info operator()() const {
342  Info info = {-1, -1};
343 #if defined(__CUDA_ARCH__)
344  info.cuda = static_cast<int>(__CUDA_ARCH__ + 0);
345 #endif
346 #if defined(EIGEN_HIP_DEVICE_COMPILE)
347  info.hip = static_cast<int>(EIGEN_HIP_DEVICE_COMPILE + 0);
348 #endif
349  return info;
350  }
351 };
352 
356 void print_gpu_device_info() {
357  int device = 0;
358  gpuDeviceProp_t deviceProp;
359  gpuGetDeviceProperties(&deviceProp, device);
360 
361  auto info = run_on_gpu(CompileTimeDeviceInfoKernel());
362 
363  std::cout << "GPU compile-time info:\n";
364 
365 #ifdef EIGEN_CUDACC
366  std::cout << " EIGEN_CUDACC: " << int(EIGEN_CUDACC) << std::endl;
367 #endif
368 
369 #ifdef EIGEN_CUDA_SDK_VER
370  std::cout << " EIGEN_CUDA_SDK_VER: " << int(EIGEN_CUDA_SDK_VER) << std::endl;
371 #endif
372 
373 #if EIGEN_COMP_NVCC
374  std::cout << " EIGEN_COMP_NVCC: " << int(EIGEN_COMP_NVCC) << std::endl;
375 #endif
376 
377 #ifdef EIGEN_HIPCC
378  std::cout << " EIGEN_HIPCC: " << int(EIGEN_HIPCC) << std::endl;
379 #endif
380 
381  std::cout << " EIGEN_CUDA_ARCH: " << info.cuda << std::endl;
382  std::cout << " EIGEN_HIP_DEVICE_COMPILE: " << info.hip << std::endl;
383 
384  std::cout << "GPU device info:\n";
385  std::cout << " name: " << deviceProp.name << std::endl;
386  std::cout << " capability: " << deviceProp.major << "." << deviceProp.minor << std::endl;
387  std::cout << " multiProcessorCount: " << deviceProp.multiProcessorCount << std::endl;
388  std::cout << " maxThreadsPerMultiProcessor: " << deviceProp.maxThreadsPerMultiProcessor << std::endl;
389  std::cout << " warpSize: " << deviceProp.warpSize << std::endl;
390  std::cout << " regsPerBlock: " << deviceProp.regsPerBlock << std::endl;
391  std::cout << " concurrentKernels: " << deviceProp.concurrentKernels << std::endl;
392  std::cout << " clockRate: " << deviceProp.clockRate << std::endl;
393  std::cout << " canMapHostMemory: " << deviceProp.canMapHostMemory << std::endl;
394  std::cout << " computeMode: " << deviceProp.computeMode << std::endl;
395 }
396 
397 #endif // EIGEN_GPUCC
398 
413 template <typename Kernel, typename... Args>
414 auto run(Kernel kernel, Args&&... args) -> decltype(kernel(args...)) {
415 #ifdef EIGEN_GPUCC
416  return run_on_gpu(kernel, std::forward<Args>(args)...);
417 #else
418  return run_on_cpu(kernel, std::forward<Args>(args)...);
419 #endif
420 }
421 
436 template <typename Kernel, typename... Args>
437 auto run_with_hint(size_t buffer_capacity_hint, Kernel kernel, Args&&... args) -> decltype(kernel(args...)) {
438 #ifdef EIGEN_GPUCC
439  return run_on_gpu_with_hint(buffer_capacity_hint, kernel, std::forward<Args>(args)...);
440 #else
441  EIGEN_UNUSED_VARIABLE(buffer_capacity_hint)
442  return run_on_cpu(kernel, std::forward<Args>(args)...);
443 #endif
444 }
445 
446 } // namespace Eigen
447 
448 #endif // GPU_TEST_HELPER_H
#define EIGEN_CUDA_SDK_VER
Definition: Macros.h:542
#define EIGEN_ALWAYS_INLINE
Definition: Macros.h:845
#define EIGEN_UNUSED_VARIABLE(var)
Definition: Macros.h:966
#define EIGEN_DEVICE_FUNC
Definition: Macros.h:892
#define EIGEN_HIP_LAUNCH_BOUNDS_1024
Definition: Macros.h:576
#define EIGEN_COMP_NVCC
Definition: Macros.h:143
#define EIGEN_STRONG_INLINE
Definition: Macros.h:834
@ N
Definition: constructor.cpp:22
void run_on_cpu(const Kernel &ker, int n, const Input &in, Output &out)
Definition: gpu_common.h:20
void run_on_gpu(const Kernel &ker, int n, const Input &in, Output &out)
Definition: gpu_common.h:34
return int(ret)+1
int info
Definition: level2_cplx_impl.h:39
func(actual_m, actual_n, a, *lda, actual_b, 1, actual_c, 1, alpha)
#define VERIFY(a)
Definition: main.h:362
TupleImpl< sizeof...(Types), Types... > tuple
Definition: Tuple.h:267
EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const tuple_get_impl< Idx, Types... >::ReturnType & get(const TupleImpl< sizeof...(Types), Types... > &tuple)
Definition: Tuple.h:214
EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType tie(Args &... args) EIGEN_NOEXCEPT
Definition: Tuple.h:242
EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType make_tuple(Args &&... args)
Definition: Tuple.h:250
typename extract_output_indices_helper< sizeof...(Types), 0, std::index_sequence<>, Types... >::type extract_output_indices
Definition: gpu_test_helper.h:80
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_serialized(Kernel kernel, uint8_t *buffer, size_t capacity)
Definition: gpu_test_helper.h:166
std::uint8_t uint8_t
Definition: Meta.h:36
Namespace containing all symbols from the Eigen library.
Definition: bench_norm.cpp:70
squared absolute value
Definition: GlobalFunctions.h:87
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const uint8_t * deserialize(const uint8_t *src, const uint8_t *end, Args &... args)
Definition: Serializer.h:201
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t serialize_size(const Args &... args)
Definition: Serializer.h:175
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint8_t * serialize(uint8_t *dest, uint8_t *end, const Args &... args)
Definition: Serializer.h:188
auto run_with_hint(size_t buffer_capacity_hint, Kernel kernel, Args &&... args) -> decltype(kernel(args...))
Definition: gpu_test_helper.h:437
val
Definition: calibrate.py:119
args
Definition: compute_granudrum_aor.py:143
type
Definition: compute_granudrum_aor.py:141
Definition: Eigen_Colamd.h:49
Container::iterator get(Container &c, Position position)
Definition: stdlist_overload.cpp:29
std::index_sequence< OutputIndices... > type
Definition: gpu_test_helper.h:73
typename extract_output_indices_helper< N - 1, Idx+1, typename std::conditional< std::is_lvalue_reference< T1 >::value &&!std::is_const< std::remove_reference_t< T1 > >::value, std::index_sequence< OutputIndices..., Idx >, std::index_sequence< OutputIndices... > >::type, Ts... >::type type
Definition: gpu_test_helper.h:67
Definition: gpu_test_helper.h:46
Definition: MoreMeta.h:202
Definition: gpu_test_helper.h:84
Definition: gpu_test_helper.h:83
typename std::conditional< std::is_same< T, void >::value, Void, T >::type ReturnType
Definition: gpu_test_helper.h:88
static EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC auto call(Func &&func, Args &&... args) -> std::enable_if_t<!std::is_same< decltype(func(args...)), void >::value, decltype(func(args...))>
Definition: gpu_test_helper.h:92
static EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC std::enable_if_t<!std::is_same< typename std::decay< T >::type, Void >::value, T > restore(T &&val)
Definition: gpu_test_helper.h:109
static EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC void restore(const Void &)
Definition: gpu_test_helper.h:115
static EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC auto call(Func &&func, Args &&... args) -> std::enable_if_t< std::is_same< decltype(func(args...)), void >::value, Void >
Definition: gpu_test_helper.h:99
Definition: benchGeometry.cpp:21
void run(const string &dir_name, LinearSolver *linear_solver_pt, const unsigned nel_1d, bool mess_up_order)
Definition: two_d_poisson_compare_solvers.cc:317