10 #if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H)
11 #define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H
19 #include "./InternalHeaderCheck.h"
// Size (in bytes) of the per-stream scratch buffer exposed via
// StreamInterface::scratchpad(). An extra unsigned int is allocated past
// this size to hold the kernel semaphore (see GpuStreamDevice::semaphore()).
static const int kGpuScratchSize = 1024;
27 class StreamInterface {
29 virtual ~StreamInterface() {}
31 virtual const gpuStream_t& stream()
const = 0;
32 virtual const gpuDeviceProp_t& deviceProperties()
const = 0;
35 virtual void* allocate(
size_t num_bytes)
const = 0;
36 virtual void deallocate(
void* buffer)
const = 0;
39 virtual void* scratchpad()
const = 0;
45 virtual unsigned int* semaphore()
const = 0;
48 class GpuDeviceProperties {
50 GpuDeviceProperties() : initialized_(false), first_(true), device_properties_(nullptr) {}
52 ~GpuDeviceProperties() {
53 if (device_properties_) {
54 delete[] device_properties_;
70 if (first_.exchange(
false)) {
73 gpuError_t status = gpuGetDeviceCount(&num_devices);
74 if (status != gpuSuccess) {
75 std::cerr <<
"Failed to get the number of GPU devices: " << gpuGetErrorString(status) << std::endl;
76 gpu_assert(status == gpuSuccess);
78 device_properties_ =
new gpuDeviceProp_t[num_devices];
79 for (
int i = 0;
i < num_devices; ++
i) {
80 status = gpuGetDeviceProperties(&device_properties_[
i],
i);
81 if (status != gpuSuccess) {
82 std::cerr <<
"Failed to initialize GPU device #" <<
i <<
": " << gpuGetErrorString(status) << std::endl;
83 gpu_assert(status == gpuSuccess);
87 std::atomic_thread_fence(std::memory_order_release);
91 while (!initialized_) {
92 std::atomic_thread_fence(std::memory_order_acquire);
93 std::this_thread::sleep_for(std::chrono::milliseconds(1000));
100 volatile bool initialized_;
101 std::atomic<bool> first_;
102 gpuDeviceProp_t* device_properties_;
106 static GpuDeviceProperties* deviceProperties =
new GpuDeviceProperties();
107 if (!deviceProperties->isInitialized()) {
108 deviceProperties->initialize();
110 return *deviceProperties;
114 return GetGpuDeviceProperties().get(device);
117 static const gpuStream_t default_stream = gpuStreamDefault;
119 class GpuStreamDevice :
public StreamInterface {
122 GpuStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) {
123 gpuError_t status = gpuGetDevice(&device_);
124 if (status != gpuSuccess) {
125 std::cerr <<
"Failed to get the GPU devices " << gpuGetErrorString(status) << std::endl;
126 gpu_assert(status == gpuSuccess);
130 GpuStreamDevice(
int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {}
135 GpuStreamDevice(
const gpuStream_t* stream,
int device = -1)
136 : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) {
138 gpuError_t status = gpuGetDevice(&device_);
139 if (status != gpuSuccess) {
140 std::cerr <<
"Failed to get the GPU devices " << gpuGetErrorString(status) << std::endl;
141 gpu_assert(status == gpuSuccess);
145 gpuError_t err = gpuGetDeviceCount(&num_devices);
147 gpu_assert(err == gpuSuccess);
148 gpu_assert(device < num_devices);
153 virtual ~GpuStreamDevice() {
155 deallocate(scratch_);
159 const gpuStream_t& stream()
const {
return *stream_; }
160 const gpuDeviceProp_t& deviceProperties()
const {
return GetGpuDeviceProperties(device_); }
161 virtual void* allocate(
size_t num_bytes)
const {
162 gpuError_t err = gpuSetDevice(device_);
164 gpu_assert(err == gpuSuccess);
166 err = gpuMalloc(&result, num_bytes);
167 gpu_assert(err == gpuSuccess);
168 gpu_assert(result != NULL);
171 virtual void deallocate(
void* buffer)
const {
172 gpuError_t err = gpuSetDevice(device_);
174 gpu_assert(err == gpuSuccess);
175 gpu_assert(buffer != NULL);
176 err = gpuFree(buffer);
177 gpu_assert(err == gpuSuccess);
180 virtual void* scratchpad()
const {
181 if (scratch_ == NULL) {
182 scratch_ = allocate(kGpuScratchSize +
sizeof(
unsigned int));
187 virtual unsigned int* semaphore()
const {
188 if (semaphore_ == NULL) {
189 char* scratch =
static_cast<char*
>(scratchpad()) + kGpuScratchSize;
190 semaphore_ =
reinterpret_cast<unsigned int*
>(scratch);
191 gpuError_t err = gpuMemsetAsync(semaphore_, 0,
sizeof(
unsigned int), *stream_);
193 gpu_assert(err == gpuSuccess);
199 const gpuStream_t* stream_;
201 mutable void* scratch_;
202 mutable unsigned int* semaphore_;
208 explicit GpuDevice(
const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) {
eigen_assert(stream); }
209 explicit GpuDevice(
const StreamInterface* stream,
int num_blocks) : stream_(stream), max_blocks_(num_blocks) {
215 EIGEN_STRONG_INLINE void* allocate(
size_t num_bytes)
const {
return stream_->allocate(num_bytes); }
219 EIGEN_STRONG_INLINE void* allocate_temp(
size_t num_bytes)
const {
return stream_->allocate(num_bytes); }
221 EIGEN_STRONG_INLINE void deallocate_temp(
void* buffer)
const { stream_->deallocate(buffer); }
223 template <
typename Type>
233 #ifndef EIGEN_GPU_COMPILE_PHASE
234 gpuError_t err = gpuMemcpyAsync(dst, src,
n, gpuMemcpyDeviceToDevice, stream_->stream());
236 gpu_assert(err == gpuSuccess);
241 eigen_assert(
false &&
"The default device should be used instead to generate kernel code");
246 gpuError_t err = gpuMemcpyAsync(dst, src,
n, gpuMemcpyHostToDevice, stream_->stream());
248 gpu_assert(err == gpuSuccess);
252 gpuError_t err = gpuMemcpyAsync(dst, src,
n, gpuMemcpyDeviceToHost, stream_->stream());
254 gpu_assert(err == gpuSuccess);
258 #ifndef EIGEN_GPU_COMPILE_PHASE
259 gpuError_t err = gpuMemsetAsync(buffer,
c,
n, stream_->stream());
261 gpu_assert(err == gpuSuccess);
266 eigen_assert(
false &&
"The default device should be used instead to generate kernel code");
270 template <
typename T>
272 #ifndef EIGEN_GPU_COMPILE_PHASE
273 const size_t count =
end - begin;
275 const int value_size =
sizeof(
value);
276 char* buffer = (
char*)begin;
277 char* value_bytes = (
char*)(&
value);
282 bool use_single_memset =
true;
283 for (
int i = 1;
i < value_size; ++
i) {
284 if (value_bytes[
i] != value_bytes[0]) {
285 use_single_memset =
false;
289 if (use_single_memset) {
290 err = gpuMemsetAsync(buffer, value_bytes[0], count *
sizeof(
T), stream_->stream());
291 gpu_assert(err == gpuSuccess);
293 for (
int b = 0;
b < value_size; ++
b) {
294 err = gpuMemset2DAsync(buffer +
b, value_size, value_bytes[
b], 1, count, stream_->stream());
295 gpu_assert(err == gpuSuccess);
302 eigen_assert(
false &&
"The default device should be used instead to generate kernel code");
319 return firstLevelCacheSize();
323 #ifndef EIGEN_GPU_COMPILE_PHASE
324 gpuError_t err = gpuStreamSynchronize(stream_->stream());
325 if (err != gpuSuccess) {
326 std::cerr <<
"Error detected in GPU stream: " << gpuGetErrorString(err) << std::endl;
327 gpu_assert(err == gpuSuccess);
330 gpu_assert(
false &&
"The default device should be used instead to generate kernel code");
334 EIGEN_STRONG_INLINE int getNumGpuMultiProcessors()
const {
return stream_->deviceProperties().multiProcessorCount; }
335 EIGEN_STRONG_INLINE int maxGpuThreadsPerBlock()
const {
return stream_->deviceProperties().maxThreadsPerBlock; }
337 return stream_->deviceProperties().maxThreadsPerMultiProcessor;
340 return static_cast<int>(stream_->deviceProperties().sharedMemPerBlock);
342 EIGEN_STRONG_INLINE int majorDeviceVersion()
const {
return stream_->deviceProperties().major; }
343 EIGEN_STRONG_INLINE int minorDeviceVersion()
const {
return stream_->deviceProperties().minor; }
349 inline bool ok()
const {
351 gpuError_t
error = gpuStreamQuery(stream_->stream());
352 return (
error == gpuSuccess) || (
error == gpuErrorNotReady);
359 const StreamInterface* stream_;
#if defined(EIGEN_HIPCC)
// HIP launch: hipLaunchKernelGGL is the portable ROCm kernel-launch entry
// point; check hipGetLastError() immediately after the launch.
#define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...)                             \
  hipLaunchKernelGGL(kernel, dim3(gridsize), dim3(blocksize), (sharedmem), (device).stream(), __VA_ARGS__); \
  gpu_assert(hipGetLastError() == hipSuccess);
#else
// CUDA launch: triple-chevron syntax on the device's stream; check
// cudaGetLastError() immediately after the launch.
#define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \
  (kernel)<<<(gridsize), (blocksize), (sharedmem), (device).stream()>>>(__VA_ARGS__); \
  gpu_assert(cudaGetLastError() == cudaSuccess);
#endif
379 static EIGEN_DEVICE_FUNC inline void setGpuSharedMemConfig(gpuSharedMemConfig config) {
380 #ifndef EIGEN_GPU_COMPILE_PHASE
381 gpuError_t status = gpuDeviceSetSharedMemConfig(config);
383 gpu_assert(status == gpuSuccess);
/* NOTE(review): The lines below are Doxygen cross-reference residue from a
   documentation-extraction pass; they reference definitions in unrelated
   files and are not part of this header. Preserved, commented out, pending
   removal:
     int i                         — BiCGSTAB_step_by_step.cpp:9
     const unsigned n              — CG3DPackingUnitTest.cpp:11
     EIGEN_ALWAYS_INLINE           — Macros.h:845
     EIGEN_UNUSED_VARIABLE(var)    — Macros.h:966
     EIGEN_DEVICE_FUNC             — Macros.h:892
     eigen_assert(x)               — Macros.h:910
     EIGEN_STRONG_INLINE           — Macros.h:834
     int data[]                    — Map_placement_new.cpp:1
     Scalar* b                     — benchVecAdd.cpp:17
     static constexpr lastp1_t end — IndexedViewHelper.h:79
     namespace Eigen               — bench_norm.cpp:70
     squared absolute value        — GlobalFunctions.h:87
     int c / int error             — calibrate.py:100 / calibrate.py:297
     Type (JSON value type)        — rapidjson.h:513
     Container::iterator get(...)  — stdlist_overload.cpp:29 */