37 std::ptrdiff_t in_bytes = in.size() *
sizeof(
typename Input::Scalar);
40 gpuMalloc((
void**)(&d_in), in_bytes);
41 gpuMalloc((
void**)(&d_out), out_bytes);
43 gpuMemcpy(d_in, in.data(), in_bytes, gpuMemcpyHostToDevice);
44 gpuMemcpy(d_out,
out.data(), out_bytes, gpuMemcpyHostToDevice);
49 dim3 Grids((
n +
int(Blocks.x) - 1) /
int(Blocks.x));
51 gpuDeviceSynchronize();
55 typename std::decay<decltype(*d_out)>::
type>),
56 dim3(Grids), dim3(Blocks), 0, 0, ker,
n, d_in, d_out);
60 run_on_gpu_meta_kernel<<<Grids, Blocks>>>(ker,
n, d_in, d_out);
64 gpuError_t err = gpuGetLastError();
65 if (err != gpuSuccess) {
66 printf(
"%s: %s\n", gpuGetErrorName(err), gpuGetErrorString(err));
71 err = gpuDeviceSynchronize();
72 if (err != gpuSuccess) {
73 printf(
"%s: %s\n", gpuGetErrorName(err), gpuGetErrorString(err));
78 gpuMemcpy(
const_cast<typename
Input::Scalar*
>(in.data()), d_in, in_bytes, gpuMemcpyDeviceToHost);
79 gpuMemcpy(
out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost);
SCALAR Scalar
Definition: bench_gemm.cpp:45
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void run_on_gpu_meta_kernel(const Kernel ker, int n, const Input *in, Output *out)
Definition: gpu_common.h:25
type
Definition: compute_granudrum_aor.py:141