d7/d43/gpu__test__helper_8h_source.html

 #ifndef GPU_TEST_HELPER_H

 #define GPU_TEST_HELPER_H


 #include <Eigen/Core>


 // Allow gpu** macros for generic tests.

 #include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>


 // std::tuple cannot be used on device, and there is a bug in cuda < 9.2 that

 // doesn't allow std::tuple to compile for host code either. In these cases,

 // use our custom implementation.

 // EIGEN_USE_CUSTOM_TUPLE is always defined after this point: 1 selects Eigen's
 // GPU tuple implementation, 0 selects std::tuple. It is a file-local switch
 // and is #undef'd again once the test_detail namespace has been populated.
 #if defined(EIGEN_GPU_COMPILE_PHASE) || (defined(EIGEN_CUDACC) && EIGEN_CUDA_SDK_VER < 92000)

 #define EIGEN_USE_CUSTOM_TUPLE 1

 #else

 #define EIGEN_USE_CUSTOM_TUPLE 0

 #endif


 #if EIGEN_USE_CUSTOM_TUPLE

 #include "../Eigen/src/Core/arch/GPU/Tuple.h"

 #else

 #include <tuple>

 #endif

 namespace Eigen {


 namespace internal {


 // Note: cannot re-use tuple_impl, since that will cause havoc for

 // tuple_test.

 namespace test_detail {
 // Expose one uniform set of tuple utilities to the helpers in this file:
 // Eigen's GPU-specific implementation when EIGEN_USE_CUSTOM_TUPLE is set,
 // and the standard library's otherwise.
 #if EIGEN_USE_CUSTOM_TUPLE
 using tuple_impl::tuple;
 using tuple_impl::make_tuple;
 using tuple_impl::tie;
 using tuple_impl::get;
 #else
 using std::tuple;
 using std::make_tuple;
 using std::tie;
 using std::get;
 #endif
 // The selection macro has served its purpose; do not leak it to includers.
 #undef EIGEN_USE_CUSTOM_TUPLE
 }  // namespace test_detail


 // Primary template (declaration only). Walks a list of types and accumulates
 // the positions of those that are non-const l-value references.
 //   N                   - number of types still to be inspected.
 //   Idx                 - position of the type at the head of the remaining list.
 //   OutputIndexSequence - std::index_sequence of the positions collected so far.
 //   Ts...               - the types still to be inspected.
 template <size_t N, size_t Idx, typename OutputIndexSequence, typename... Ts>
 struct extract_output_indices_helper;

 // Recursive step: classify the head type, then continue with the tail.
 template <size_t Remaining, size_t Position, size_t... Collected, typename Head, typename... Tail>
 struct extract_output_indices_helper<Remaining, Position, std::index_sequence<Collected...>, Head, Tail...> {
  private:
   // A type counts as an output iff it is an l-value reference to a non-const type.
   using is_output = std::integral_constant<bool, std::is_lvalue_reference<Head>::value &&
                                                      !std::is_const<std::remove_reference_t<Head>>::value>;

   // Positions collected once the head type has been taken into account.
   using collected = std::conditional_t<is_output::value, std::index_sequence<Collected..., Position>,
                                        std::index_sequence<Collected...>>;

  public:
   using type = typename extract_output_indices_helper<Remaining - 1, Position + 1, collected, Tail...>::type;
 };

 // Terminal step: no types left, the accumulated sequence is the result.
 template <size_t Position, size_t... Collected>
 struct extract_output_indices_helper<0, Position, std::index_sequence<Collected...>> {
   using type = std::index_sequence<Collected...>;
 };

 // Yields a std::index_sequence holding the (zero-based) positions within
 // Types... of every type that is a non-const l-value reference.
 template <typename... Types>
 using extract_output_indices =
     typename extract_output_indices_helper<sizeof...(Types), 0, std::index_sequence<>, Types...>::type;


 // Helper struct for dealing with generic functors that may return void.
 //
 // A `void` result cannot be stored in a variable, so the helpers below
 // substitute the empty placeholder type `Void` wherever a functor returns
 // `void`, and convert back again when handing the result to the caller.
 struct void_helper {
   // Empty stand-in for a `void` result.
   struct Void {};

   // Maps void -> Void, and every other T to itself.
   template <typename T>
   using ReturnType = typename std::conditional<std::is_same<T, void>::value, Void, T>::type;

   // Invokes func(args...) and returns its result.
   // Selected when the call expression has a non-void type.
   template <typename Func, typename... Args>
   static EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC auto call(Func&& func, Args&&... args)
       -> std::enable_if_t<!std::is_same<decltype(func(args...)), void>::value, decltype(func(args...))> {
     return func(std::forward<Args>(args)...);
   }

   // Invokes func(args...) and returns a Void placeholder.
   // Selected when the call expression has type void.
   template <typename Func, typename... Args>
   static EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC auto call(Func&& func, Args&&... args)
       -> std::enable_if_t<std::is_same<decltype(func(args...)), void>::value, Void> {
     func(std::forward<Args>(args)...);
     return Void{};
   }

   // Restores the original return type for anything that is not a Void
   // placeholder: the value is handed back unchanged.
   //
   // The value is forwarded rather than returned by name: `val` is a named
   // reference, so a plain `return val;` would copy an rvalue argument (and
   // fail to compile for move-only types) prior to C++20.
   template <typename T>
   static EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC
       std::enable_if_t<!std::is_same<typename std::decay<T>::type, Void>::value, T>
       restore(T&& val) {
     return std::forward<T>(val);
   }

   // Void placeholder: converts back to a `void` result.
   template <typename T = void>
   static EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC void restore(const Void&) {}
 };


 // Runs a kernel via serialized buffer.  Does this by deserializing the buffer

 // to construct the arguments, calling the kernel, then re-serializing the outputs.

 // The buffer contains

 //     [ input_buffer_size, args ]

 // After the kernel call, it is then populated with

 //     [ output_buffer_size, output_parameters, return_value ]

 // If the output_buffer_size exceeds the buffer's capacity, then only the

 // output_buffer_size is populated.

 template <typename Kernel, typename... Args, size_t... Indices, size_t... OutputIndices>
 EIGEN_DEVICE_FUNC void run_serialized(std::index_sequence<Indices...>, std::index_sequence<OutputIndices...>,
                                       Kernel kernel, uint8_t* buffer, size_t capacity) {
   using test_detail::get;
   using test_detail::make_tuple;
   using test_detail::tuple;

   // ---- Read phase: [ input_buffer_size, args ] ----
   const uint8_t* src = buffer;
   const uint8_t* src_end = buffer + capacity;
   size_t serialized_input_size;
   src = Eigen::deserialize(src, src_end, serialized_input_size);

   // Default-construct one value per argument (references and cv-qualifiers
   // stripped), then fill them in from the buffer.
   auto values = make_tuple(typename std::decay<Args>::type{}...);
   EIGEN_UNUSED_VARIABLE(values)  // Avoid NVCC compile warning.
   // NVCC 9.1 requires us to spell out the template parameters explicitly.
   src = Eigen::deserialize(src, src_end, get<Indices, typename std::decay<Args>::type...>(values)...);

   // ---- Invoke the kernel ----
   // A void result is mapped to void_helper::Void so that `ret` always has a
   // complete type.
   auto ret = void_helper::call(kernel, get<Indices, typename std::decay<Args>::type...>(values)...);

   // ---- Write phase: [ output_buffer_size, output_parameters, return_value ] ----
   // Bytes needed for the size field, the output arguments and the result.
   const size_t required_size =
       Eigen::serialize_size(capacity) +
       Eigen::serialize_size(get<OutputIndices, typename std::decay<Args>::type...>(values)...) +
       Eigen::serialize_size(ret);

   // The required size is always reported back, even if nothing else fits.
   uint8_t* dst = buffer;
   uint8_t* dst_end = buffer + capacity;
   dst = Eigen::serialize(dst, dst_end, required_size);

   // Null `dst` can be safely passed along.
   // Outputs and result are written only when the whole payload fits.
   const bool payload_fits = required_size <= capacity;
   if (payload_fits) {
     dst = Eigen::serialize(dst, dst_end, get<OutputIndices, typename std::decay<Args>::type...>(values)...);
     dst = Eigen::serialize(dst, dst_end, ret);
   }
 }


 // Convenience overload: derives the index sequences from Args... and forwards
 // to the index-sequence overload of run_serialized.
 template <typename Kernel, typename... Args>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_serialized(Kernel kernel, uint8_t* buffer, size_t capacity) {
   // Positions of all arguments.
   using AllIndices = std::make_index_sequence<sizeof...(Args)>;
   // Positions of the arguments that are non-const l-value references.
   using OutputIndices = extract_output_indices<Args...>;
   run_serialized<Kernel, Args...>(AllIndices{}, OutputIndices{}, kernel, buffer, capacity);
 }


 #ifdef EIGEN_GPUCC


 // Checks for GPU errors and asserts / prints the error message.
 //
 // `expr` is evaluated exactly once. It is parenthesized, and the result is
 // stored in a deliberately unusual local name, so that the macro stays
 // correct for arbitrary expressions -- including ones that mention a
 // caller-side variable called `err`.
 #define GPU_CHECK(expr)                                     \
   do {                                                      \
     const gpuError_t gpu_check_err_ = (expr);               \
     if (gpu_check_err_ != gpuSuccess) {                     \
       printf("%s: %s\n", gpuGetErrorName(gpu_check_err_),   \
              gpuGetErrorString(gpu_check_err_));            \
       gpu_assert(false);                                    \
     }                                                       \
   } while (0)


 // Calls run_serialized on the GPU.
 //
 // This is the __global__ entry point launched by run_serialized_on_gpu. It
 // simply forwards to run_serialized<Kernel, Args...>, which reads the
 // arguments from `buffer`, invokes `kernel`, and writes the outputs back into
 // the same buffer.
 //   kernel   - functor to invoke.
 //   buffer   - serialization buffer shared by inputs and outputs.
 //   capacity - size of `buffer` in bytes.

 template <typename Kernel, typename... Args>

 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void run_serialized_on_gpu_meta_kernel(const Kernel kernel, uint8_t* buffer,

                                                                                size_t capacity) {

   run_serialized<Kernel, Args...>(kernel, buffer, capacity);

 }


 // Runs kernel(args...) on the GPU via the serialization mechanism.

 //

 // Note: the kernel is launched exactly once. If the serialized outputs do not
 // fit in the buffer, the run fails with a message suggesting a larger buffer
 // capacity hint.

 // Serializes `args` into a buffer, runs `kernel` on the GPU through
 // run_serialized_on_gpu_meta_kernel, then deserializes the output arguments
 // and the return value from the same buffer.
 //
 //   buffer_capacity_hint - minimum payload capacity in bytes; 0 means
 //                          "estimate from the input size".
 //   Indices...           - positions of all arguments (unused here, consumed
 //                          on the device side).
 //   OutputIndices...     - positions of the arguments that receive outputs.
 //   kernel               - functor to run on the device.
 //   args                 - kernel arguments; those at OutputIndices are
 //                          overwritten with the values produced on the device.
 //
 // Returns whatever kernel(args...) returns (possibly void). Every GPU runtime
 // call is checked with GPU_CHECK, and the run fails through VERIFY if the
 // serialized outputs do not fit into the buffer.
 template <typename Kernel, typename... Args, size_t... Indices, size_t... OutputIndices>
 auto run_serialized_on_gpu(size_t buffer_capacity_hint, std::index_sequence<Indices...>,
                            std::index_sequence<OutputIndices...>, Kernel kernel, Args&&... args)
     -> decltype(kernel(args...)) {
   // Bytes occupied by the serialized inputs: a leading size_t holding the
   // input size, followed by the arguments themselves.
   const size_t input_data_size = sizeof(size_t) + Eigen::serialize_size(args...);

   // Choose the capacity of the buffer shared by inputs and outputs.
   size_t capacity;
   if (buffer_capacity_hint == 0) {
     // No hint: keep doubling sizeof(size_t) until it exceeds the input size,
     // which leaves a little extra room for outputs.
     capacity = sizeof(size_t);
     while (capacity <= input_data_size) {
       capacity *= 2;
     }
   } else {
     // Use the larger of the hint and the total input size.
     // Add sizeof(size_t) to the hint to account for storing the buffer capacity
     // itself so the user doesn't need to think about this.
     capacity = std::max<size_t>(buffer_capacity_hint + sizeof(size_t), input_data_size);
   }

   // Serialize the inputs into a host-side staging buffer.
   std::vector<uint8_t> buffer(capacity);
   uint8_t* host_data = buffer.data();
   uint8_t* host_data_end = host_data + capacity;
   uint8_t* host_ptr = Eigen::serialize(host_data, host_data_end, input_data_size);
   host_ptr = Eigen::serialize(host_ptr, host_data_end, args...);

   // Copy inputs to the device. Allocation and copy failures are reported
   // here instead of surfacing later as an unrelated kernel error.
   uint8_t* device_data = nullptr;
   GPU_CHECK(gpuMalloc(reinterpret_cast<void**>(&device_data), capacity));
   GPU_CHECK(gpuMemcpy(device_data, host_data, input_data_size, gpuMemcpyHostToDevice));
   GPU_CHECK(gpuDeviceSynchronize());

 // Run kernel.
 #ifdef EIGEN_USE_HIP
   hipLaunchKernelGGL(HIP_KERNEL_NAME(run_serialized_on_gpu_meta_kernel<Kernel, Args...>), 1, 1, 0, 0, kernel,
                      device_data, capacity);
 #else
   run_serialized_on_gpu_meta_kernel<Kernel, Args...><<<1, 1>>>(kernel, device_data, capacity);
 #endif
   // Check pre-launch and kernel execution errors.
   GPU_CHECK(gpuGetLastError());
   GPU_CHECK(gpuDeviceSynchronize());

   // Copy the whole buffer (now holding the outputs) back to the host and
   // release the device allocation.
   GPU_CHECK(gpuMemcpy(host_data, device_data, capacity, gpuMemcpyDeviceToHost));
   GPU_CHECK(gpuFree(device_data));
   GPU_CHECK(gpuDeviceSynchronize());

   // The buffer now starts with the size required for the serialized outputs.
   size_t output_data_size = 0;
   const uint8_t* c_host_ptr = Eigen::deserialize(host_data, host_data_end, output_data_size);
   // If the output doesn't fit in the buffer, spit out warning and fail.
   if (output_data_size > capacity) {
     std::cerr << "The serialized output does not fit in the output buffer, " << output_data_size << " vs capacity "
               << capacity << "." << std::endl
               << "Try specifying a minimum buffer capacity: " << std::endl
               << "  run_with_hint(" << output_data_size << ", ...)" << std::endl;
     VERIFY(false);
   }

   // Deserialize outputs directly into the caller's output arguments.
   auto args_tuple = test_detail::tie(args...);
   EIGEN_UNUSED_VARIABLE(args_tuple)  // Avoid NVCC compile warning.
   c_host_ptr = Eigen::deserialize(c_host_ptr, host_data_end, test_detail::get<OutputIndices, Args&...>(args_tuple)...);

   // Maybe deserialize return value, properly handling void.
   typename void_helper::ReturnType<decltype(kernel(args...))> result;
   c_host_ptr = Eigen::deserialize(c_host_ptr, host_data_end, result);
   return void_helper::restore(result);
 }


 #endif  // EIGEN_GPUCC


 }  // namespace internal


 // Invokes kernel(args...) directly as an ordinary function call, without any
 // serialization.
 //   kernel - functor to invoke (taken by value).
 //   args   - arguments, perfectly forwarded to the kernel.
 // Returns whatever the kernel returns (possibly void).
 template <typename Kernel, typename... Args>

 auto run_on_cpu(Kernel kernel, Args&&... args) -> decltype(kernel(args...)) {

   return kernel(std::forward<Args>(args)...);

 }


 #ifdef EIGEN_GPUCC


 template <typename Kernel, typename... Args>

 auto run_on_gpu(Kernel kernel, Args&&... args) -> decltype(kernel(args...)) {

   return internal::run_serialized_on_gpu<Kernel, Args...>(

       /*buffer_capacity_hint=*/0, std::make_index_sequence<sizeof...(Args)>{},

       internal::extract_output_indices<Args...>{}, kernel, std::forward<Args>(args)...);

 }


 template <typename Kernel, typename... Args>

 auto run_on_gpu_with_hint(size_t buffer_capacity_hint, Kernel kernel, Args&&... args) -> decltype(kernel(args...)) {

   return internal::run_serialized_on_gpu<Kernel, Args...>(

       buffer_capacity_hint, std::make_index_sequence<sizeof...(Args)>{}, internal::extract_output_indices<Args...>{},

       kernel, std::forward<Args>(args)...);

 }


 // Kernel functor reporting the values of the device-side compile-time macros
 // as seen by the code it is compiled into.
 struct CompileTimeDeviceInfoKernel {
   // Reported values; a field is -1 when the corresponding macro is not defined.
   struct Info {
     int cuda;  // Value of __CUDA_ARCH__.
     int hip;   // Value of EIGEN_HIP_DEVICE_COMPILE.
   };

   EIGEN_DEVICE_FUNC Info operator()() const {
 #if defined(__CUDA_ARCH__)
     const int cuda_arch = static_cast<int>(__CUDA_ARCH__ + 0);
 #else
     const int cuda_arch = -1;
 #endif
 #if defined(EIGEN_HIP_DEVICE_COMPILE)
     const int hip_device_compile = static_cast<int>(EIGEN_HIP_DEVICE_COMPILE + 0);
 #else
     const int hip_device_compile = -1;
 #endif
     return Info{cuda_arch, hip_device_compile};
   }
 };


 void print_gpu_device_info() {

   int device = 0;

   gpuDeviceProp_t deviceProp;

   gpuGetDeviceProperties(&deviceProp, device);


   auto info = run_on_gpu(CompileTimeDeviceInfoKernel());


   std::cout << "GPU compile-time info:\n";


 #ifdef EIGEN_CUDACC

   std::cout << "  EIGEN_CUDACC:                " << int(EIGEN_CUDACC) << std::endl;

 #endif


 #ifdef EIGEN_CUDA_SDK_VER

   std::cout << "  EIGEN_CUDA_SDK_VER:          " << int(EIGEN_CUDA_SDK_VER) << std::endl;

 #endif


 #if EIGEN_COMP_NVCC

   std::cout << "  EIGEN_COMP_NVCC:             " << int(EIGEN_COMP_NVCC) << std::endl;

 #endif


 #ifdef EIGEN_HIPCC

   std::cout << "  EIGEN_HIPCC:                 " << int(EIGEN_HIPCC) << std::endl;

 #endif


   std::cout << "  EIGEN_CUDA_ARCH:             " << info.cuda << std::endl;

   std::cout << "  EIGEN_HIP_DEVICE_COMPILE:    " << info.hip << std::endl;


   std::cout << "GPU device info:\n";

   std::cout << "  name:                        " << deviceProp.name << std::endl;

   std::cout << "  capability:                  " << deviceProp.major << "." << deviceProp.minor << std::endl;

   std::cout << "  multiProcessorCount:         " << deviceProp.multiProcessorCount << std::endl;

   std::cout << "  maxThreadsPerMultiProcessor: " << deviceProp.maxThreadsPerMultiProcessor << std::endl;

   std::cout << "  warpSize:                    " << deviceProp.warpSize << std::endl;

   std::cout << "  regsPerBlock:                " << deviceProp.regsPerBlock << std::endl;

   std::cout << "  concurrentKernels:           " << deviceProp.concurrentKernels << std::endl;

   std::cout << "  clockRate:                   " << deviceProp.clockRate << std::endl;

   std::cout << "  canMapHostMemory:            " << deviceProp.canMapHostMemory << std::endl;

   std::cout << "  computeMode:                 " << deviceProp.computeMode << std::endl;

 }


 #endif  // EIGEN_GPUCC


 // Runs kernel(args...) on the backend selected at compile time: through
 // run_on_gpu when EIGEN_GPUCC is defined, through run_on_cpu otherwise.
 //   kernel - functor to run.
 //   args   - arguments, perfectly forwarded to the selected runner.
 // Returns whatever the kernel returns (possibly void).
 template <typename Kernel, typename... Args>

 auto run(Kernel kernel, Args&&... args) -> decltype(kernel(args...)) {

 #ifdef EIGEN_GPUCC

   return run_on_gpu(kernel, std::forward<Args>(args)...);

 #else

   return run_on_cpu(kernel, std::forward<Args>(args)...);

 #endif

 }


 // Same as run(), with a caller-supplied buffer capacity hint.
 //   buffer_capacity_hint - passed on to run_on_gpu_with_hint when EIGEN_GPUCC
 //                          is defined; ignored on the run_on_cpu path.
 //   kernel               - functor to run.
 //   args                 - arguments, perfectly forwarded to the selected runner.
 // Returns whatever the kernel returns (possibly void).
 template <typename Kernel, typename... Args>

 auto run_with_hint(size_t buffer_capacity_hint, Kernel kernel, Args&&... args) -> decltype(kernel(args...)) {

 #ifdef EIGEN_GPUCC

   return run_on_gpu_with_hint(buffer_capacity_hint, kernel, std::forward<Args>(args)...);

 #else

   // The hint only matters for the GPU serialization buffer.
   EIGEN_UNUSED_VARIABLE(buffer_capacity_hint)

   return run_on_cpu(kernel, std::forward<Args>(args)...);

 #endif

 }


 }  // namespace Eigen


 #endif  // GPU_TEST_HELPER_H

EIGEN_CUDA_SDK_VER
#define EIGEN_CUDA_SDK_VER
Definition: Macros.h:542

EIGEN_ALWAYS_INLINE
#define EIGEN_ALWAYS_INLINE
Definition: Macros.h:845

EIGEN_UNUSED_VARIABLE
#define EIGEN_UNUSED_VARIABLE(var)
Definition: Macros.h:966

EIGEN_DEVICE_FUNC
#define EIGEN_DEVICE_FUNC
Definition: Macros.h:892

EIGEN_HIP_LAUNCH_BOUNDS_1024
#define EIGEN_HIP_LAUNCH_BOUNDS_1024
Definition: Macros.h:576

EIGEN_COMP_NVCC
#define EIGEN_COMP_NVCC
Definition: Macros.h:143

EIGEN_STRONG_INLINE
#define EIGEN_STRONG_INLINE
Definition: Macros.h:834

TensorGpuHipCudaDefines.h

Eigen::Triplet< double >

Kernel

N
@ N
Definition: constructor.cpp:22

run_on_cpu
void run_on_cpu(const Kernel &ker, int n, const Input &in, Output &out)
Definition: gpu_common.h:20

run_on_gpu
void run_on_gpu(const Kernel &ker, int n, const Input &in, Output &out)
Definition: gpu_common.h:34

int
return int(ret)+1

info
int info
Definition: level2_cplx_impl.h:39

func
func(actual_m, actual_n, a, *lda, actual_b, 1, actual_c, 1, alpha)

VERIFY
#define VERIFY(a)
Definition: main.h:362

Eigen::internal::tuple_impl::tuple
TupleImpl< sizeof...(Types), Types... > tuple
Definition: Tuple.h:267

Eigen::internal::tuple_impl::get
EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const tuple_get_impl< Idx, Types... >::ReturnType & get(const TupleImpl< sizeof...(Types), Types... > &tuple)
Definition: Tuple.h:214

Eigen::internal::tuple_impl::tie
EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType tie(Args &... args) EIGEN_NOEXCEPT
Definition: Tuple.h:242

Eigen::internal::tuple_impl::make_tuple
EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType make_tuple(Args &&... args)
Definition: Tuple.h:250

Eigen::internal::extract_output_indices
typename extract_output_indices_helper< sizeof...(Types), 0, std::index_sequence<>, Types... >::type extract_output_indices
Definition: gpu_test_helper.h:80

Eigen::internal::run_serialized
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_serialized(Kernel kernel, uint8_t *buffer, size_t capacity)
Definition: gpu_test_helper.h:166

Eigen::numext::uint8_t
std::uint8_t uint8_t
Definition: Meta.h:36

Eigen
Namespace containing all symbols from the Eigen library.
Definition: bench_norm.cpp:70

Eigen::value
squared absolute value
Definition: GlobalFunctions.h:87

Eigen::deserialize
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const uint8_t * deserialize(const uint8_t *src, const uint8_t *end, Args &... args)
Definition: Serializer.h:201

Eigen::serialize_size
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t serialize_size(const Args &... args)
Definition: Serializer.h:175

Eigen::serialize
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint8_t * serialize(uint8_t *dest, uint8_t *end, const Args &... args)
Definition: Serializer.h:188

Eigen::run_with_hint
auto run_with_hint(size_t buffer_capacity_hint, Kernel kernel, Args &&... args) -> decltype(kernel(args...))
Definition: gpu_test_helper.h:437

calibrate.val
val
Definition: calibrate.py:119

compute_granudrum_aor.args
args
Definition: compute_granudrum_aor.py:143

compute_granudrum_aor.type
type
Definition: compute_granudrum_aor.py:141

internal
Definition: Eigen_Colamd.h:49

get
Container::iterator get(Container &c, Position position)
Definition: stdlist_overload.cpp:29

Eigen::internal::extract_output_indices_helper< 0, Idx, std::index_sequence< OutputIndices... > >::type
std::index_sequence< OutputIndices... > type
Definition: gpu_test_helper.h:73

Eigen::internal::extract_output_indices_helper< N, Idx, std::index_sequence< OutputIndices... >, T1, Ts... >::type
typename extract_output_indices_helper< N - 1, Idx+1, typename std::conditional< std::is_lvalue_reference< T1 >::value &&!std::is_const< std::remove_reference_t< T1 > >::value, std::index_sequence< OutputIndices..., Idx >, std::index_sequence< OutputIndices... > >::type, Ts... >::type type
Definition: gpu_test_helper.h:67

Eigen::internal::extract_output_indices_helper
Definition: gpu_test_helper.h:46

Eigen::internal::get
Definition: MoreMeta.h:202

Eigen::internal::void_helper::Void
Definition: gpu_test_helper.h:84

Eigen::internal::void_helper
Definition: gpu_test_helper.h:83

Eigen::internal::void_helper::ReturnType
typename std::conditional< std::is_same< T, void >::value, Void, T >::type ReturnType
Definition: gpu_test_helper.h:88

Eigen::internal::void_helper::call
static EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC auto call(Func &&func, Args &&... args) -> std::enable_if_t<!std::is_same< decltype(func(args...)), void >::value, decltype(func(args...))>
Definition: gpu_test_helper.h:92

Eigen::internal::void_helper::restore
static EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC std::enable_if_t<!std::is_same< typename std::decay< T >::type, Void >::value, T > restore(T &&val)
Definition: gpu_test_helper.h:109

Eigen::internal::void_helper::restore
static EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC void restore(const Void &)
Definition: gpu_test_helper.h:115

Eigen::internal::void_helper::call
static EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC auto call(Func &&func, Args &&... args) -> std::enable_if_t< std::is_same< decltype(func(args...)), void >::value, Void >
Definition: gpu_test_helper.h:99

func
Definition: benchGeometry.cpp:21

run
void run(const string &dir_name, LinearSolver *linear_solver_pt, const unsigned nel_1d, bool mess_up_order)
Definition: two_d_poisson_compare_solvers.cc:317