TensorDeviceGpu.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H)
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H

// This header defines the gpu* macros, which resolve to their equivalent
// hip* or cuda* versions depending on the compiler in use.
// A separate header (included at the end of this file) will undefine them all.
#include "TensorGpuHipCudaDefines.h"

// IWYU pragma: private
#include "./InternalHeaderCheck.h"

namespace Eigen {

static const int kGpuScratchSize = 1024;

// This defines an interface that GPUDevice can take to use
// HIP / CUDA streams underneath.
class StreamInterface {
 public:
  virtual ~StreamInterface() {}

  virtual const gpuStream_t& stream() const = 0;
  virtual const gpuDeviceProp_t& deviceProperties() const = 0;

  // Allocate memory on the actual device where the computation will run
  virtual void* allocate(size_t num_bytes) const = 0;
  virtual void deallocate(void* buffer) const = 0;

  // Return a scratchpad buffer of kGpuScratchSize bytes (1 KiB)
  virtual void* scratchpad() const = 0;

  // Return a semaphore. The semaphore is initialized to 0, and each kernel
  // using it is responsible for resetting it to 0 upon completion, so that
  // the semaphore is always equal to 0 when a kernel starts.
  virtual unsigned int* semaphore() const = 0;
};
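
// GpuStreamDevice (defined below) is the implementation of this interface
// that ships with Eigen. Callers can also supply their own implementation,
// for instance to route allocate()/deallocate() through an application-level
// memory pool. A minimal sketch, assuming a hypothetical MyDevicePool type
// with malloc()/free() methods (not part of Eigen):
//
//   class PooledStreamDevice : public Eigen::StreamInterface {
//    public:
//     PooledStreamDevice(const gpuStream_t* stream, MyDevicePool* pool)
//         : delegate_(stream), pool_(pool) {}
//     const gpuStream_t& stream() const override { return delegate_.stream(); }
//     const gpuDeviceProp_t& deviceProperties() const override { return delegate_.deviceProperties(); }
//     void* allocate(size_t num_bytes) const override { return pool_->malloc(num_bytes); }
//     void deallocate(void* buffer) const override { pool_->free(buffer); }
//     void* scratchpad() const override { return delegate_.scratchpad(); }
//     unsigned int* semaphore() const override { return delegate_.semaphore(); }
//
//    private:
//     Eigen::GpuStreamDevice delegate_;  // reuses the default scratchpad/semaphore handling
//     MyDevicePool* pool_;               // hypothetical pool, not part of Eigen
//   };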

class GpuDeviceProperties {
 public:
  GpuDeviceProperties() : initialized_(false), first_(true), device_properties_(nullptr) {}

  ~GpuDeviceProperties() {
    if (device_properties_) {
      delete[] device_properties_;
    }
  }

  EIGEN_STRONG_INLINE const gpuDeviceProp_t& get(int device) const { return device_properties_[device]; }

  EIGEN_STRONG_INLINE bool isInitialized() const { return initialized_; }

  void initialize() {
    if (!initialized_) {
      // Attempts to ensure proper behavior in the case of multiple threads
      // calling this function simultaneously. This would be trivial to
      // implement if we could use std::mutex, but unfortunately mutexes don't
      // compile with nvcc, so we resort to atomics and thread fences instead.
      // Note that if the caller uses a compiler that doesn't support c++11 we
      // can't ensure that the initialization is thread safe.
      if (first_.exchange(false)) {
        // We're the first thread to reach this point.
        int num_devices;
        gpuError_t status = gpuGetDeviceCount(&num_devices);
        if (status != gpuSuccess) {
          std::cerr << "Failed to get the number of GPU devices: " << gpuGetErrorString(status) << std::endl;
          gpu_assert(status == gpuSuccess);
        }
        device_properties_ = new gpuDeviceProp_t[num_devices];
        for (int i = 0; i < num_devices; ++i) {
          status = gpuGetDeviceProperties(&device_properties_[i], i);
          if (status != gpuSuccess) {
            std::cerr << "Failed to initialize GPU device #" << i << ": " << gpuGetErrorString(status) << std::endl;
            gpu_assert(status == gpuSuccess);
          }
        }

        std::atomic_thread_fence(std::memory_order_release);
        initialized_ = true;
      } else {
        // Wait for the other thread to initialize the properties.
        while (!initialized_) {
          std::atomic_thread_fence(std::memory_order_acquire);
          std::this_thread::sleep_for(std::chrono::milliseconds(1000));
        }
      }
    }
  }

 private:
  volatile bool initialized_;
  std::atomic<bool> first_;
  gpuDeviceProp_t* device_properties_;
};

EIGEN_ALWAYS_INLINE const GpuDeviceProperties& GetGpuDeviceProperties() {
  static GpuDeviceProperties* deviceProperties = new GpuDeviceProperties();
  if (!deviceProperties->isInitialized()) {
    deviceProperties->initialize();
  }
  return *deviceProperties;
}

EIGEN_ALWAYS_INLINE const gpuDeviceProp_t& GetGpuDeviceProperties(int device) {
  return GetGpuDeviceProperties().get(device);
}
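
// Usage sketch (illustration only; the variable names are placeholders):
// querying the cached properties of device 0, e.g. to size a kernel launch.
//
//   const gpuDeviceProp_t& prop = Eigen::GetGpuDeviceProperties(0);
//   const int num_sm      = prop.multiProcessorCount;
//   const int max_threads = prop.maxThreadsPerBlock;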

static const gpuStream_t default_stream = gpuStreamDefault;

class GpuStreamDevice : public StreamInterface {
 public:
  // Use the default stream on the current device
  GpuStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) {
    gpuError_t status = gpuGetDevice(&device_);
    if (status != gpuSuccess) {
      std::cerr << "Failed to get the current GPU device: " << gpuGetErrorString(status) << std::endl;
      gpu_assert(status == gpuSuccess);
    }
  }
  // Use the default stream on the specified device
  GpuStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {}
  // Use the specified stream. Note that it's the caller's responsibility to
  // ensure that the stream can run on the specified device. If no device is
  // specified, the code assumes that the stream is associated with the
  // current GPU device.
  GpuStreamDevice(const gpuStream_t* stream, int device = -1)
      : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) {
    if (device < 0) {
      gpuError_t status = gpuGetDevice(&device_);
      if (status != gpuSuccess) {
        std::cerr << "Failed to get the current GPU device: " << gpuGetErrorString(status) << std::endl;
        gpu_assert(status == gpuSuccess);
      }
    } else {
      int num_devices;
      gpuError_t err = gpuGetDeviceCount(&num_devices);
      EIGEN_UNUSED_VARIABLE(err)
      gpu_assert(err == gpuSuccess);
      gpu_assert(device < num_devices);
      device_ = device;
    }
  }

  virtual ~GpuStreamDevice() {
    if (scratch_) {
      deallocate(scratch_);
    }
  }

  const gpuStream_t& stream() const { return *stream_; }
  const gpuDeviceProp_t& deviceProperties() const { return GetGpuDeviceProperties(device_); }
  virtual void* allocate(size_t num_bytes) const {
    gpuError_t err = gpuSetDevice(device_);
    EIGEN_UNUSED_VARIABLE(err)
    gpu_assert(err == gpuSuccess);
    void* result;
    err = gpuMalloc(&result, num_bytes);
    gpu_assert(err == gpuSuccess);
    gpu_assert(result != NULL);
    return result;
  }
  virtual void deallocate(void* buffer) const {
    gpuError_t err = gpuSetDevice(device_);
    EIGEN_UNUSED_VARIABLE(err)
    gpu_assert(err == gpuSuccess);
    gpu_assert(buffer != NULL);
    err = gpuFree(buffer);
    gpu_assert(err == gpuSuccess);
  }

  virtual void* scratchpad() const {
    if (scratch_ == NULL) {
      // Allocate the scratch area plus one extra unsigned int that backs semaphore().
      scratch_ = allocate(kGpuScratchSize + sizeof(unsigned int));
    }
    return scratch_;
  }

  virtual unsigned int* semaphore() const {
    if (semaphore_ == NULL) {
      // The semaphore lives in the word just past the kGpuScratchSize-byte scratch area.
      char* scratch = static_cast<char*>(scratchpad()) + kGpuScratchSize;
      semaphore_ = reinterpret_cast<unsigned int*>(scratch);
      gpuError_t err = gpuMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_);
      EIGEN_UNUSED_VARIABLE(err)
      gpu_assert(err == gpuSuccess);
    }
    return semaphore_;
  }

 private:
  const gpuStream_t* stream_;
  int device_;
  mutable void* scratch_;
  mutable unsigned int* semaphore_;
};
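
// Usage sketch (illustration only): wrapping an externally created stream so
// that Eigen submits its work to it. The stream creation below uses the plain
// CUDA runtime API (hipStreamCreate under HIP) and omits error checking.
//
//   cudaStream_t my_stream;                          // hipStream_t when compiling with HIP
//   cudaStreamCreate(&my_stream);
//   Eigen::GpuStreamDevice stream_device(&my_stream, /*device=*/0);
//   Eigen::GpuDevice gpu_device(&stream_device);     // GpuDevice does not take ownership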

struct GpuDevice {
  // The StreamInterface is not owned: the caller is
  // responsible for its initialization and eventual destruction.
  explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) { eigen_assert(stream); }
  explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) {
    eigen_assert(stream);
  }
  // TODO(bsteiner): This is an internal API, we should not expose it.
  EIGEN_STRONG_INLINE const gpuStream_t& stream() const { return stream_->stream(); }

  EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { return stream_->allocate(num_bytes); }

  EIGEN_STRONG_INLINE void deallocate(void* buffer) const { stream_->deallocate(buffer); }

  EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const { return stream_->allocate(num_bytes); }

  EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const { stream_->deallocate(buffer); }

  template <typename Type>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const {
    return data;
  }

  EIGEN_STRONG_INLINE void* scratchpad() const { return stream_->scratchpad(); }

  EIGEN_STRONG_INLINE unsigned int* semaphore() const { return stream_->semaphore(); }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
#ifndef EIGEN_GPU_COMPILE_PHASE
    gpuError_t err = gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToDevice, stream_->stream());
    EIGEN_UNUSED_VARIABLE(err)
    gpu_assert(err == gpuSuccess);
#else
    EIGEN_UNUSED_VARIABLE(dst)
    EIGEN_UNUSED_VARIABLE(src)
    EIGEN_UNUSED_VARIABLE(n)
    eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
  }

  EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
    gpuError_t err = gpuMemcpyAsync(dst, src, n, gpuMemcpyHostToDevice, stream_->stream());
    EIGEN_UNUSED_VARIABLE(err)
    gpu_assert(err == gpuSuccess);
  }

  EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
    gpuError_t err = gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToHost, stream_->stream());
    EIGEN_UNUSED_VARIABLE(err)
    gpu_assert(err == gpuSuccess);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
#ifndef EIGEN_GPU_COMPILE_PHASE
    gpuError_t err = gpuMemsetAsync(buffer, c, n, stream_->stream());
    EIGEN_UNUSED_VARIABLE(err)
    gpu_assert(err == gpuSuccess);
#else
    EIGEN_UNUSED_VARIABLE(buffer)
    EIGEN_UNUSED_VARIABLE(c)
    EIGEN_UNUSED_VARIABLE(n)
    eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
  }

  template <typename T>
  EIGEN_STRONG_INLINE void fill(T* begin, T* end, const T& value) const {
#ifndef EIGEN_GPU_COMPILE_PHASE
    const size_t count = end - begin;
    // Split value into bytes and run memset with stride.
    const int value_size = sizeof(value);
    char* buffer = (char*)begin;
    char* value_bytes = (char*)(&value);
    gpuError_t err;
    EIGEN_UNUSED_VARIABLE(err)

    // If all value bytes are equal, then a single memset can be much faster.
    bool use_single_memset = true;
    for (int i = 1; i < value_size; ++i) {
      if (value_bytes[i] != value_bytes[0]) {
        use_single_memset = false;
      }
    }

    if (use_single_memset) {
      err = gpuMemsetAsync(buffer, value_bytes[0], count * sizeof(T), stream_->stream());
      gpu_assert(err == gpuSuccess);
    } else {
      for (int b = 0; b < value_size; ++b) {
        err = gpuMemset2DAsync(buffer + b, value_size, value_bytes[b], 1, count, stream_->stream());
        gpu_assert(err == gpuSuccess);
      }
    }
#else
    EIGEN_UNUSED_VARIABLE(begin)
    EIGEN_UNUSED_VARIABLE(end)
    EIGEN_UNUSED_VARIABLE(value)
    eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
  }
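
  // Worked example of the byte-split above (illustration only; d_data, N and
  // gpu_device are placeholder names): filling N floats with 1.0f. The
  // little-endian byte pattern of 1.0f is {0x00, 0x00, 0x80, 0x3F}; since the
  // bytes differ, fill() issues four strided gpuMemset2DAsync calls, one per
  // byte of the pattern, each with a pitch of sizeof(float) == 4. Filling
  // with 0.0f (all bytes equal) takes the single-memset fast path instead.
  //
  //   float* d_data = static_cast<float*>(gpu_device.allocate(N * sizeof(float)));
  //   gpu_device.fill(d_data, d_data + N, 1.0f);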

  EIGEN_STRONG_INLINE size_t numThreads() const {
    // FIXME
    return 32;
  }

  EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
    // FIXME
    return 48 * 1024;
  }

  EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
    // We won't try to take advantage of the L2 cache for the time being, and
    // there is no L3 cache on HIP/CUDA devices.
    return firstLevelCacheSize();
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
#ifndef EIGEN_GPU_COMPILE_PHASE
    gpuError_t err = gpuStreamSynchronize(stream_->stream());
    if (err != gpuSuccess) {
      std::cerr << "Error detected in GPU stream: " << gpuGetErrorString(err) << std::endl;
      gpu_assert(err == gpuSuccess);
    }
#else
    gpu_assert(false && "The default device should be used instead to generate kernel code");
#endif
  }

  EIGEN_STRONG_INLINE int getNumGpuMultiProcessors() const { return stream_->deviceProperties().multiProcessorCount; }
  EIGEN_STRONG_INLINE int maxGpuThreadsPerBlock() const { return stream_->deviceProperties().maxThreadsPerBlock; }
  EIGEN_STRONG_INLINE int maxGpuThreadsPerMultiProcessor() const {
    return stream_->deviceProperties().maxThreadsPerMultiProcessor;
  }
  EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
    return static_cast<int>(stream_->deviceProperties().sharedMemPerBlock);
  }
  EIGEN_STRONG_INLINE int majorDeviceVersion() const { return stream_->deviceProperties().major; }
  EIGEN_STRONG_INLINE int minorDeviceVersion() const { return stream_->deviceProperties().minor; }

  EIGEN_STRONG_INLINE int maxBlocks() const { return max_blocks_; }

  // This function checks if the GPU runtime recorded an error for the
  // underlying stream device.
  inline bool ok() const {
#ifdef EIGEN_GPUCC
    gpuError_t error = gpuStreamQuery(stream_->stream());
    return (error == gpuSuccess) || (error == gpuErrorNotReady);
#else
    return false;
#endif
  }

 private:
  const StreamInterface* stream_;
  int max_blocks_;
};
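
// End-to-end usage sketch (illustration only; a, b, c, d_a, d_b, d_c and the
// sizes are placeholder names, not part of this header). It shows the typical
// pattern: allocate device buffers, copy inputs, evaluate a tensor expression
// on the GPU via .device(), then copy the result back and synchronize.
//
//   #include <unsupported/Eigen/CXX11/Tensor>
//
//   Eigen::GpuStreamDevice stream;             // default stream on the current device
//   Eigen::GpuDevice gpu_device(&stream);
//
//   Eigen::Tensor<float, 1> a(128), b(128), c(128);
//   a.setRandom();
//   b.setRandom();
//
//   const size_t bytes = 128 * sizeof(float);
//   float* d_a = static_cast<float*>(gpu_device.allocate(bytes));
//   float* d_b = static_cast<float*>(gpu_device.allocate(bytes));
//   float* d_c = static_cast<float*>(gpu_device.allocate(bytes));
//   gpu_device.memcpyHostToDevice(d_a, a.data(), bytes);
//   gpu_device.memcpyHostToDevice(d_b, b.data(), bytes);
//
//   Eigen::TensorMap<Eigen::Tensor<float, 1>> ta(d_a, 128), tb(d_b, 128), tc(d_c, 128);
//   tc.device(gpu_device) = ta + tb;           // evaluated asynchronously on the GPU stream
//
//   gpu_device.memcpyDeviceToHost(c.data(), d_c, bytes);
//   gpu_device.synchronize();                  // wait for the copy before reading c on the host
//   gpu_device.deallocate(d_a);
//   gpu_device.deallocate(d_b);
//   gpu_device.deallocate(d_c);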

#if defined(EIGEN_HIPCC)

#define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...)                              \
  hipLaunchKernelGGL(kernel, dim3(gridsize), dim3(blocksize), (sharedmem), (device).stream(), __VA_ARGS__); \
  gpu_assert(hipGetLastError() == hipSuccess);

#else

#define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...)        \
  (kernel)<<<(gridsize), (blocksize), (sharedmem), (device).stream()>>>(__VA_ARGS__); \
  gpu_assert(cudaGetLastError() == cudaSuccess);

#endif
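
// Usage sketch for LAUNCH_GPU_KERNEL (illustration only; my_kernel, d_out, n,
// gpu_device and the launch geometry are placeholder names, not part of Eigen):
//
//   __global__ void my_kernel(float* out, int n) {
//     const int i = blockIdx.x * blockDim.x + threadIdx.x;
//     if (i < n) out[i] = 1.0f;
//   }
//
//   // Host side, with d_out pointing at n floats of device memory:
//   const int block_size = gpu_device.maxGpuThreadsPerBlock();
//   const int num_blocks = (n + block_size - 1) / block_size;
//   LAUNCH_GPU_KERNEL(my_kernel, num_blocks, block_size, /*sharedmem=*/0, gpu_device, d_out, n);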

// FIXME: Should be device and kernel specific.
#ifdef EIGEN_GPUCC
static EIGEN_DEVICE_FUNC inline void setGpuSharedMemConfig(gpuSharedMemConfig config) {
#ifndef EIGEN_GPU_COMPILE_PHASE
  gpuError_t status = gpuDeviceSetSharedMemConfig(config);
  EIGEN_UNUSED_VARIABLE(status)
  gpu_assert(status == gpuSuccess);
#else
  EIGEN_UNUSED_VARIABLE(config)
#endif
}
#endif

}  // end namespace Eigen

// Undefine all the gpu* macros we defined at the beginning of the file.
#include "TensorGpuHipCudaUndefines.h"

#endif  // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H