1 #ifndef GPU_TEST_HELPER_H
2 #define GPU_TEST_HELPER_H
12 #if defined(EIGEN_GPU_COMPILE_PHASE) || (defined(EIGEN_CUDACC) && EIGEN_CUDA_SDK_VER < 92000)
13 #define EIGEN_USE_CUSTOM_TUPLE 1
15 #define EIGEN_USE_CUSTOM_TUPLE 0
18 #if EIGEN_USE_CUSTOM_TUPLE
19 #include "../Eigen/src/Core/arch/GPU/Tuple.h"
29 namespace test_detail {
31 #if !EIGEN_USE_CUSTOM_TUPLE
42 #undef EIGEN_USE_CUSTOM_TUPLE
45 template <
size_t N,
size_t Idx,
typename OutputIndexSequence,
typename... Ts>
59 template <
size_t N,
size_t Idx,
size_t... OutputIndices,
typename T1,
typename... Ts>
63 typename std::conditional<
66 std::index_sequence<OutputIndices..., Idx>, std::index_sequence<OutputIndices...>>
::type,
71 template <
size_t Idx,
size_t... OutputIndices>
73 using type = std::index_sequence<OutputIndices...>;
78 template <
typename... Types>
91 template <
typename Func,
typename... Args>
94 return func(std::forward<Args>(
args)...);
98 template <
typename Func,
typename... Args>
106 template <
typename T>
114 template <
typename T =
void>
126 template <
typename Kernel,
typename... Args,
size_t... Indices,
size_t... OutputIndices>
134 const uint8_t* read_ptr = buffer;
135 const uint8_t* read_end = buffer + capacity;
154 uint8_t* write_end = buffer + capacity;
158 if (output_size <= capacity) {
165 template <
typename Kernel,
typename... Args>
168 kernel, buffer, capacity);
174 #define GPU_CHECK(expr) \
176 gpuError_t err = expr; \
177 if (err != gpuSuccess) { \
178 printf("%s: %s\n", gpuGetErrorName(err), gpuGetErrorString(err)); \
184 template <
typename Kernel,
typename... Args>
194 template <
typename Kernel,
typename... Args,
size_t... Indices,
size_t... OutputIndices>
195 auto run_serialized_on_gpu(
size_t buffer_capacity_hint, std::index_sequence<Indices...>,
196 std::index_sequence<OutputIndices...>,
Kernel kernel, Args&&...
args)
197 -> decltype(kernel(
args...)) {
204 if (buffer_capacity_hint == 0) {
206 capacity =
sizeof(size_t);
207 while (capacity <= input_data_size) {
214 capacity = std::max<size_t>(buffer_capacity_hint +
sizeof(
size_t), input_data_size);
216 std::vector<uint8_t> buffer(capacity);
219 uint8_t* host_data_end =
nullptr;
221 uint8_t* device_data =
nullptr;
222 size_t output_data_size = 0;
225 capacity = std::max<size_t>(capacity, output_data_size);
226 buffer.resize(capacity);
227 host_data = buffer.data();
228 host_data_end = buffer.data() + capacity;
233 gpuMalloc((
void**)(&device_data), capacity);
234 gpuMemcpy(device_data, buffer.data(), input_data_size, gpuMemcpyHostToDevice);
235 GPU_CHECK(gpuDeviceSynchronize());
239 hipLaunchKernelGGL(HIP_KERNEL_NAME(run_serialized_on_gpu_meta_kernel<Kernel, Args...>), 1, 1, 0, 0, kernel,
240 device_data, capacity);
242 run_serialized_on_gpu_meta_kernel<
Kernel, Args...><<<1, 1>>>(kernel, device_data, capacity);
245 GPU_CHECK(gpuGetLastError());
246 GPU_CHECK(gpuDeviceSynchronize());
248 gpuMemcpy(host_data, device_data, capacity, gpuMemcpyDeviceToHost);
249 gpuFree(device_data);
250 GPU_CHECK(gpuDeviceSynchronize());
255 if (output_data_size > capacity) {
256 std::cerr <<
"The serialized output does not fit in the output buffer, " << output_data_size <<
" vs capacity "
257 << capacity <<
"." << std::endl
258 <<
"Try specifying a minimum buffer capacity: " << std::endl
259 <<
" run_with_hint(" << output_data_size <<
", ...)" << std::endl;
266 c_host_ptr =
Eigen::deserialize(c_host_ptr, host_data_end, test_detail::get<OutputIndices, Args&...>(args_tuple)...);
269 typename void_helper::ReturnType<decltype(kernel(
args...))> result;
271 return void_helper::restore(result);
284 template <
typename Kernel,
typename... Args>
286 return kernel(std::forward<Args>(
args)...);
303 template <
typename Kernel,
typename... Args>
305 return internal::run_serialized_on_gpu<
Kernel, Args...>(
306 0, std::make_index_sequence<
sizeof...(Args)>{},
324 template <
typename Kernel,
typename... Args>
325 auto run_on_gpu_with_hint(
size_t buffer_capacity_hint,
Kernel kernel, Args&&...
args) -> decltype(kernel(
args...)) {
326 return internal::run_serialized_on_gpu<
Kernel, Args...>(
328 kernel, std::forward<Args>(
args)...);
335 struct CompileTimeDeviceInfoKernel {
342 Info
info = {-1, -1};
343 #if defined(__CUDA_ARCH__)
344 info.cuda =
static_cast<int>(__CUDA_ARCH__ + 0);
346 #if defined(EIGEN_HIP_DEVICE_COMPILE)
347 info.hip =
static_cast<int>(EIGEN_HIP_DEVICE_COMPILE + 0);
356 void print_gpu_device_info() {
358 gpuDeviceProp_t deviceProp;
359 gpuGetDeviceProperties(&deviceProp, device);
363 std::cout <<
"GPU compile-time info:\n";
366 std::cout <<
" EIGEN_CUDACC: " <<
int(EIGEN_CUDACC) << std::endl;
369 #ifdef EIGEN_CUDA_SDK_VER
378 std::cout <<
" EIGEN_HIPCC: " <<
int(EIGEN_HIPCC) << std::endl;
381 std::cout <<
" EIGEN_CUDA_ARCH: " <<
info.cuda << std::endl;
382 std::cout <<
" EIGEN_HIP_DEVICE_COMPILE: " <<
info.hip << std::endl;
384 std::cout <<
"GPU device info:\n";
385 std::cout <<
" name: " << deviceProp.name << std::endl;
386 std::cout <<
" capability: " << deviceProp.major <<
"." << deviceProp.minor << std::endl;
387 std::cout <<
" multiProcessorCount: " << deviceProp.multiProcessorCount << std::endl;
388 std::cout <<
" maxThreadsPerMultiProcessor: " << deviceProp.maxThreadsPerMultiProcessor << std::endl;
389 std::cout <<
" warpSize: " << deviceProp.warpSize << std::endl;
390 std::cout <<
" regsPerBlock: " << deviceProp.regsPerBlock << std::endl;
391 std::cout <<
" concurrentKernels: " << deviceProp.concurrentKernels << std::endl;
392 std::cout <<
" clockRate: " << deviceProp.clockRate << std::endl;
393 std::cout <<
" canMapHostMemory: " << deviceProp.canMapHostMemory << std::endl;
394 std::cout <<
" computeMode: " << deviceProp.computeMode << std::endl;
413 template <
typename Kernel,
typename... Args>
436 template <
typename Kernel,
typename... Args>
439 return run_on_gpu_with_hint(buffer_capacity_hint, kernel, std::forward<Args>(
args)...);
#define EIGEN_CUDA_SDK_VER
Definition: Macros.h:542
#define EIGEN_ALWAYS_INLINE
Definition: Macros.h:845
#define EIGEN_UNUSED_VARIABLE(var)
Definition: Macros.h:966
#define EIGEN_DEVICE_FUNC
Definition: Macros.h:892
#define EIGEN_HIP_LAUNCH_BOUNDS_1024
Definition: Macros.h:576
#define EIGEN_COMP_NVCC
Definition: Macros.h:143
#define EIGEN_STRONG_INLINE
Definition: Macros.h:834
@ N
Definition: constructor.cpp:22
void run_on_cpu(const Kernel &ker, int n, const Input &in, Output &out)
Definition: gpu_common.h:20
void run_on_gpu(const Kernel &ker, int n, const Input &in, Output &out)
Definition: gpu_common.h:34
int info
Definition: level2_cplx_impl.h:39
func(actual_m, actual_n, a, *lda, actual_b, 1, actual_c, 1, alpha)
#define VERIFY(a)
Definition: main.h:362
TupleImpl< sizeof...(Types), Types... > tuple
Definition: Tuple.h:267
EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const tuple_get_impl< Idx, Types... >::ReturnType & get(const TupleImpl< sizeof...(Types), Types... > &tuple)
Definition: Tuple.h:214
EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType tie(Args &... args) EIGEN_NOEXCEPT
Definition: Tuple.h:242
EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType make_tuple(Args &&... args)
Definition: Tuple.h:250
typename extract_output_indices_helper< sizeof...(Types), 0, std::index_sequence<>, Types... >::type extract_output_indices
Definition: gpu_test_helper.h:80
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_serialized(Kernel kernel, uint8_t *buffer, size_t capacity)
Definition: gpu_test_helper.h:166
std::uint8_t uint8_t
Definition: Meta.h:36
Namespace containing all symbols from the Eigen library.
Definition: bench_norm.cpp:70
squared absolute value
Definition: GlobalFunctions.h:87
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const uint8_t * deserialize(const uint8_t *src, const uint8_t *end, Args &... args)
Definition: Serializer.h:201
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t serialize_size(const Args &... args)
Definition: Serializer.h:175
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint8_t * serialize(uint8_t *dest, uint8_t *end, const Args &... args)
Definition: Serializer.h:188
auto run_with_hint(size_t buffer_capacity_hint, Kernel kernel, Args &&... args) -> decltype(kernel(args...))
Definition: gpu_test_helper.h:437
val
Definition: calibrate.py:119
args
Definition: compute_granudrum_aor.py:143
type
Definition: compute_granudrum_aor.py:141
Definition: Eigen_Colamd.h:49
Container::iterator get(Container &c, Position position)
Definition: stdlist_overload.cpp:29
Definition: MoreMeta.h:202
Definition: gpu_test_helper.h:84
Definition: gpu_test_helper.h:83
typename std::conditional< std::is_same< T, void >::value, Void, T >::type ReturnType
Definition: gpu_test_helper.h:88
static EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC auto call(Func &&func, Args &&... args) -> std::enable_if_t<!std::is_same< decltype(func(args...)), void >::value, decltype(func(args...))>
Definition: gpu_test_helper.h:92
static EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC std::enable_if_t<!std::is_same< typename std::decay< T >::type, Void >::value, T > restore(T &&val)
Definition: gpu_test_helper.h:109
static EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC void restore(const Void &)
Definition: gpu_test_helper.h:115
static EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC auto call(Func &&func, Args &&... args) -> std::enable_if_t< std::is_same< decltype(func(args...)), void >::value, Void >
Definition: gpu_test_helper.h:99
Definition: benchGeometry.cpp:21
void run(const string &dir_name, LinearSolver *linear_solver_pt, const unsigned nel_1d, bool mess_up_order)
Definition: two_d_poisson_compare_solvers.cc:317