mgpucontext.h
#pragma once

#include "util/util.h"
#include "util/format.h"
#include "mgpualloc.h"
#include <cuda.h>

namespace mgpu {


#ifdef _DEBUG
#define MGPU_SYNC_CHECK(s) { \
    cudaError_t error = cudaDeviceSynchronize(); \
    if(cudaSuccess != error) { \
        printf("CUDA ERROR %d %s\n%s:%d.\n%s\n", \
            error, cudaGetErrorString(error), __FILE__, __LINE__, s); \
        exit(0); \
    } \
}
#else
#define MGPU_SYNC_CHECK(s)
#endif

template<typename T>
void copyDtoH(T* dest, const T* source, int count) {
    cudaMemcpy(dest, source, sizeof(T) * count, cudaMemcpyDeviceToHost);
}
template<typename T>
void copyDtoD(T* dest, const T* source, int count, cudaStream_t stream = 0) {
    cudaMemcpyAsync(dest, source, sizeof(T) * count, cudaMemcpyDeviceToDevice,
        stream);
}
template<typename T>
void copyDtoH(std::vector<T>& dest, const T* source, int count) {
    dest.resize(count);
    if(count)
        copyDtoH(&dest[0], source, count);
}

template<typename T>
void copyHtoD(T* dest, const T* source, int count) {
    cudaMemcpy(dest, source, sizeof(T) * count, cudaMemcpyHostToDevice);
}
template<typename T>
void copyHtoD(T* dest, const std::vector<T>& source) {
    if(source.size())
        copyHtoD(dest, &source[0], source.size());
}
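
// Usage sketch for the copy helpers above (illustrative only; d_data is a
// hypothetical device pointer holding at least n ints):
//
//   std::vector<int> h_data;
//   copyDtoH(h_data, d_data, n);      // device -> host, resizes h_data
//   copyHtoD(d_data, h_data);         // host -> device, whole vector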


////////////////////////////////////////////////////////////////////////////////

class CudaContext;
typedef intrusive_ptr<CudaContext> ContextPtr;

class CudaException : public std::exception {
public:
    cudaError_t error;

    CudaException() throw() { }
    CudaException(cudaError_t e) throw() : error(e) { }
    CudaException(const CudaException& e) throw() : error(e.error) { }

    virtual const char* what() const throw() {
        return "CUDA runtime error";
    }
};
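
// CudaException carries the raw cudaError_t so callers can branch on the
// specific failure. A minimal sketch of calling code (p and bytes are
// hypothetical):
//
//   void* p;
//   if(cudaSuccess != cudaMalloc(&p, bytes))
//       throw CudaException(cudaErrorMemoryAllocation);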


////////////////////////////////////////////////////////////////////////////////
// CudaEvent and CudaTimer.
// Exception-safe wrappers around cudaEvent_t.

class CudaEvent : public noncopyable {
public:
    CudaEvent() {
        cudaEventCreate(&_event);
    }
    explicit CudaEvent(int flags) {
        cudaEventCreateWithFlags(&_event, flags);
    }
    ~CudaEvent() {
        cudaEventDestroy(_event);
    }
    operator cudaEvent_t() { return _event; }
    void Swap(CudaEvent& rhs) {
        std::swap(_event, rhs._event);
    }
private:
    cudaEvent_t _event;
};

class CudaTimer : public noncopyable {
    CudaEvent start, end;
public:
    void Start();
    double Split();
    double Throughput(int count, int numIterations);
};
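
// Timing sketch, assuming Split() reports the elapsed seconds since the last
// Start() or Split(). MyKernel is a placeholder for work to be timed:
//
//   CudaTimer timer;
//   timer.Start();
//   // MyKernel<<<grid, block>>>(...);
//   double seconds = timer.Split();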


////////////////////////////////////////////////////////////////////////////////

struct DeviceGroup;

class CudaDevice : public noncopyable {
    friend struct DeviceGroup;
public:
    static int DeviceCount();
    static CudaDevice& ByOrdinal(int ordinal);
    static CudaDevice& Selected();

    // Device properties.
    const cudaDeviceProp& Prop() const { return _prop; }
    int Ordinal() const { return _ordinal; }
    int NumSMs() const { return _prop.multiProcessorCount; }
    int ArchVersion() const { return 100 * _prop.major + 10 * _prop.minor; }

    // LaunchBox properties.
    int PTXVersion() const { return _ptxVersion; }

    std::string DeviceString() const;

    // Set this device as the active device on the thread.
    void SetActive();

private:
    CudaDevice() { } // Hide the constructor.
    int _ordinal;
    int _ptxVersion;
    cudaDeviceProp _prop;
};
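
// Query sketch: devices are owned by the library and accessed by ordinal.
// For example, to print each device's name and SM version:
//
//   for(int i = 0; i < CudaDevice::DeviceCount(); ++i) {
//       CudaDevice& d = CudaDevice::ByOrdinal(i);
//       printf("%d: %s (sm_%d)\n", i, d.Prop().name, d.ArchVersion() / 10);
//   }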

////////////////////////////////////////////////////////////////////////////////
// CudaDeviceMem
// Exception-safe CUDA device memory container. Use the MGPU_MEM(T) macro for
// the type of the reference-counting container.
// CudaDeviceMem AddRefs the allocator that returned the memory, releasing the
// pointer when the object is destroyed.

typedef intrusive_ptr<CudaAlloc> AllocPtr;
#define MGPU_MEM(type) mgpu::intrusive_ptr< mgpu::CudaDeviceMem< type > >

template<typename T>
class CudaDeviceMem : public CudaBase {
    friend class CudaMemSupport;
public:
    ~CudaDeviceMem();

    const T* get() const { return _p; }
    T* get() { return _p; }

    operator const T*() const { return get(); }
    operator T*() { return get(); }

    // Size is in units of T, not bytes.
    size_t Size() const { return _size; }

    // Copy from this to the argument array.
    cudaError_t ToDevice(T* data, size_t count) const;
    cudaError_t ToDevice(size_t srcOffset, size_t bytes, void* data) const;
    cudaError_t ToHost(T* data, size_t count) const;
    cudaError_t ToHost(std::vector<T>& data) const;
    cudaError_t ToHost(std::vector<T>& data, size_t count) const;
    cudaError_t ToHost(size_t srcOffset, size_t bytes, void* data) const;

    // Copy from the argument array to this.
    cudaError_t FromDevice(const T* data, size_t count);
    cudaError_t FromDevice(size_t dstOffset, size_t bytes, const void* data);
    cudaError_t FromHost(const std::vector<T>& data);
    cudaError_t FromHost(const std::vector<T>& data, size_t count);
    cudaError_t FromHost(const T* data, size_t count);
    cudaError_t FromHost(size_t dstOffset, size_t bytes, const void* data);

private:
    friend class CudaContext;
    CudaDeviceMem(CudaAlloc* alloc) : _p(0), _size(0), _alloc(alloc) { }

    AllocPtr _alloc;
    T* _p;
    size_t _size;
};

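// Allocation sketch: MGPU_MEM(T) is the reference-counted handle type for
// CudaDeviceMem<T>; the device memory is released when the last reference
// drops. Assuming a context pointer `context` (see CudaContext below):
//
//   MGPU_MEM(int) data = context->Fill(1000, 0);   // 1000 zeroed ints
//   std::vector<int> host;
//   data->ToHost(host);                            // copy all 1000 back
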
////////////////////////////////////////////////////////////////////////////////
// CudaMemSupport
// Convenience functions for allocating device memory and copying to it from
// the host. These functions are factored into their own class for clarity.
// CudaContext derives from this class.

class CudaMemSupport : public CudaBase {
    friend class CudaDevice;
    friend class CudaContext;
public:
    CudaDevice& Device() { return _alloc->Device(); }

    // Swap out the associated allocator.
    void SetAllocator(CudaAlloc* alloc) {
        assert(alloc->Device().Ordinal() == _alloc->Device().Ordinal());
        _alloc.reset(alloc);
    }

    // Access the associated allocator.
    CudaAlloc* GetAllocator() { return _alloc.get(); }

    // Support for creating arrays.
    template<typename T>
    MGPU_MEM(T) Malloc(size_t count);

    template<typename T>
    MGPU_MEM(T) Malloc(const T* data, size_t count);

    template<typename T>
    MGPU_MEM(T) Malloc(const std::vector<T>& data);

    template<typename T>
    MGPU_MEM(T) Fill(size_t count, T fill);

    template<typename T>
    MGPU_MEM(T) FillAscending(size_t count, T first, T step);

    template<typename T>
    MGPU_MEM(T) GenRandom(size_t count, T min, T max);

    template<typename T>
    MGPU_MEM(T) SortRandom(size_t count, T min, T max);

    template<typename T, typename Func>
    MGPU_MEM(T) GenFunc(size_t count, Func f);

protected:
    CudaMemSupport() { }
    AllocPtr _alloc;
};

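// Array-creation sketch, assuming `context` is a ContextPtr (CudaContext
// derives from CudaMemSupport, so these calls go through the context):
//
//   MGPU_MEM(int) a = context->FillAscending<int>(100, 0, 2);     // 0,2,4,...
//   MGPU_MEM(float) b = context->GenRandom<float>(100, 0.0f, 1.0f);
//   MGPU_MEM(int) c = context->SortRandom<int>(100, 0, 99);       // sorted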

////////////////////////////////////////////////////////////////////////////////

class CudaContext;

// Create a context on the default stream (0).
ContextPtr CreateCudaDevice(int ordinal);
ContextPtr CreateCudaDevice(int argc, char** argv, bool printInfo = false);

// Create a context on a new stream.
ContextPtr CreateCudaDeviceStream(int ordinal);
ContextPtr CreateCudaDeviceStream(int argc, char** argv,
    bool printInfo = false);

// Create a context and attach to an existing stream.
ContextPtr CreateCudaDeviceAttachStream(int ordinal, cudaStream_t stream);

struct ContextGroup;

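// Context-creation sketch: a typical program creates one context per device
// and passes it to the MGPU routines. The (argc, argv) overloads select the
// device from the command line and print device info when printInfo is true:
//
//   int main(int argc, char** argv) {
//       ContextPtr context = CreateCudaDevice(argc, argv, true);
//       // ... run MGPU calls with *context ...
//       return 0;
//   }
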
class CudaContext : public CudaMemSupport {
    friend struct ContextGroup;

    friend ContextPtr CreateCudaDevice(int ordinal);
    friend ContextPtr CreateCudaDeviceStream(int ordinal);
    friend ContextPtr CreateCudaDeviceAttachStream(int ordinal,
        cudaStream_t stream);
public:
    static CudaContext& StandardContext(int ordinal = -1);

    // 4KB of page-locked memory per context.
    int* PageLocked() { return _pageLocked; }
    cudaStream_t AuxStream() const { return _auxStream; }

    int NumSMs() { return Device().NumSMs(); }
    int ArchVersion() { return Device().ArchVersion(); }
    int PTXVersion() { return Device().PTXVersion(); }
    std::string DeviceString() { return Device().DeviceString(); }

    cudaStream_t Stream() const { return _stream; }

    // Set this device as the active device on the thread.
    void SetActive() { Device().SetActive(); }

    // Access the included event.
    CudaEvent& Event() { return _event; }

    // Use the included timer.
    CudaTimer& Timer() { return _timer; }
    void Start() { _timer.Start(); }
    double Split() { return _timer.Split(); }
    double Throughput(int count, int numIterations) {
        return _timer.Throughput(count, numIterations);
    }

    virtual long AddRef() {
        return _noRefCount ? 1 : CudaMemSupport::AddRef();
    }
    virtual void Release() {
        if(!_noRefCount) CudaMemSupport::Release();
    }
private:
    CudaContext(CudaDevice& device, bool newStream, bool standard);
    ~CudaContext();

    AllocPtr CreateDefaultAlloc(CudaDevice& device);

    bool _ownStream;
    cudaStream_t _stream;
    cudaStream_t _auxStream;
    CudaEvent _event;
    CudaTimer _timer;
    bool _noRefCount;
    int* _pageLocked;
};
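
// Context-usage sketch: the context bundles device, stream, allocator, event,
// and timer behind one object. StandardContext() appears to return a
// library-managed shared context (ordinal -1 presumably meaning the currently
// selected device):
//
//   CudaContext& ctx = CudaContext::StandardContext();
//   ctx.Start();
//   // ... enqueue work on ctx.Stream() ...
//   printf("%f s\n", ctx.Split());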

////////////////////////////////////////////////////////////////////////////////
// CudaDeviceMem method implementations

template<typename T>
cudaError_t CudaDeviceMem<T>::ToDevice(T* data, size_t count) const {
    return ToDevice(0, sizeof(T) * count, data);
}
template<typename T>
cudaError_t CudaDeviceMem<T>::ToDevice(size_t srcOffset, size_t bytes,
    void* data) const {
    cudaError_t error = cudaMemcpy(data, (char*)_p + srcOffset, bytes,
        cudaMemcpyDeviceToDevice);
    if(cudaSuccess != error) {
        printf("CudaDeviceMem::ToDevice copy error %d\n", error);
        exit(0);
    }
    return error;
}

template<typename T>
cudaError_t CudaDeviceMem<T>::ToHost(T* data, size_t count) const {
    return ToHost(0, sizeof(T) * count, data);
}
template<typename T>
cudaError_t CudaDeviceMem<T>::ToHost(std::vector<T>& data, size_t count) const {
    data.resize(count);
    cudaError_t error = cudaSuccess;
    if(_size) error = ToHost(&data[0], count);
    return error;
}
template<typename T>
cudaError_t CudaDeviceMem<T>::ToHost(std::vector<T>& data) const {
    return ToHost(data, _size);
}
template<typename T>
cudaError_t CudaDeviceMem<T>::ToHost(size_t srcOffset, size_t bytes,
    void* data) const {

    cudaError_t error = cudaMemcpy(data, (char*)_p + srcOffset, bytes,
        cudaMemcpyDeviceToHost);
    if(cudaSuccess != error) {
        printf("CudaDeviceMem::ToHost copy error %d\n", error);
        exit(0);
    }
    return error;
}

template<typename T>
cudaError_t CudaDeviceMem<T>::FromDevice(const T* data, size_t count) {
    return FromDevice(0, sizeof(T) * count, data);
}
template<typename T>
cudaError_t CudaDeviceMem<T>::FromDevice(size_t dstOffset, size_t bytes,
    const void* data) {
    if(dstOffset + bytes > sizeof(T) * _size)
        return cudaErrorInvalidValue;
    // dstOffset is in bytes, so offset through a char* rather than a T*.
    cudaMemcpy((char*)_p + dstOffset, data, bytes, cudaMemcpyDeviceToDevice);
    return cudaSuccess;
}
template<typename T>
cudaError_t CudaDeviceMem<T>::FromHost(const std::vector<T>& data,
    size_t count) {
    cudaError_t error = cudaSuccess;
    if(data.size()) error = FromHost(&data[0], count);
    return error;
}
template<typename T>
cudaError_t CudaDeviceMem<T>::FromHost(const std::vector<T>& data) {
    return FromHost(data, data.size());
}
template<typename T>
cudaError_t CudaDeviceMem<T>::FromHost(const T* data, size_t count) {
    return FromHost(0, sizeof(T) * count, data);
}
template<typename T>
cudaError_t CudaDeviceMem<T>::FromHost(size_t dstOffset, size_t bytes,
    const void* data) {
    if(dstOffset + bytes > sizeof(T) * _size)
        return cudaErrorInvalidValue;
    // dstOffset is in bytes, so offset through a char* rather than a T*.
    cudaMemcpy((char*)_p + dstOffset, data, bytes, cudaMemcpyHostToDevice);
    return cudaSuccess;
}
template<typename T>
CudaDeviceMem<T>::~CudaDeviceMem() {
    _alloc->Free(_p);
}

////////////////////////////////////////////////////////////////////////////////
// CudaMemSupport method implementations

template<typename T>
MGPU_MEM(T) CudaMemSupport::Malloc(size_t count) {
    MGPU_MEM(T) mem(new CudaDeviceMem<T>(_alloc.get()));
    mem->_size = count;
    cudaError_t error = _alloc->Malloc(sizeof(T) * count, (void**)&mem->_p);
    if(cudaSuccess != error) {
        printf("cudaMalloc error %d\n", error);
        exit(0);
        throw CudaException(cudaErrorMemoryAllocation);
    }
#ifdef DEBUG
    // Initialize the memory to -1 in debug mode.
//  cudaMemset(mem->get(), -1, count);
#endif

    return mem;
}

template<typename T>
MGPU_MEM(T) CudaMemSupport::Malloc(const T* data, size_t count) {
    MGPU_MEM(T) mem = Malloc<T>(count);
    mem->FromHost(data, count);
    return mem;
}

template<typename T>
MGPU_MEM(T) CudaMemSupport::Malloc(const std::vector<T>& data) {
    MGPU_MEM(T) mem = Malloc<T>(data.size());
    if(data.size()) mem->FromHost(&data[0], data.size());
    return mem;
}

template<typename T>
MGPU_MEM(T) CudaMemSupport::Fill(size_t count, T fill) {
    std::vector<T> data(count, fill);
    return Malloc(data);
}

template<typename T>
MGPU_MEM(T) CudaMemSupport::FillAscending(size_t count, T first, T step) {
    std::vector<T> data(count);
    for(size_t i = 0; i < count; ++i)
        data[i] = first + i * step;
    return Malloc(data);
}

template<typename T>
MGPU_MEM(T) CudaMemSupport::GenRandom(size_t count, T min, T max) {
    std::vector<T> data(count);
    for(size_t i = 0; i < count; ++i)
        data[i] = Rand(min, max);
    return Malloc(data);
}

template<typename T>
MGPU_MEM(T) CudaMemSupport::SortRandom(size_t count, T min, T max) {
    std::vector<T> data(count);
    for(size_t i = 0; i < count; ++i)
        data[i] = Rand(min, max);
    std::sort(data.begin(), data.end());
    return Malloc(data);
}

template<typename T, typename Func>
MGPU_MEM(T) CudaMemSupport::GenFunc(size_t count, Func f) {
    std::vector<T> data(count);
    for(size_t i = 0; i < count; ++i)
        data[i] = f(i);

    MGPU_MEM(T) mem = Malloc<T>(count);
    mem->FromHost(data, count);
    return mem;
}
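// GenFunc sketch: any functor mapping index -> value can populate an array on
// the device. Squares is a hypothetical functor:
//
//   struct Squares {
//       int operator()(size_t i) const { return (int)(i * i); }
//   };
//   MGPU_MEM(int) squares = context->GenFunc<int>(1000, Squares());
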
////////////////////////////////////////////////////////////////////////////////
// Format methods that operate directly on device mem.

template<typename T, typename Op>
std::string FormatArrayOp(const CudaDeviceMem<T>& mem, int count, Op op,
    int numCols) {
    std::vector<T> host;
    mem.ToHost(host, count);
    return FormatArrayOp(host, op, numCols);
}

template<typename T, typename Op>
std::string FormatArrayOp(const CudaDeviceMem<T>& mem, Op op, int numCols) {
    return FormatArrayOp(mem, mem.Size(), op, numCols);
}

template<typename T>
void PrintArray(const CudaDeviceMem<T>& mem, int count, const char* format,
    int numCols) {
    std::string s = FormatArrayOp(mem, count, FormatOpPrintf(format), numCols);
    printf("%s", s.c_str());
}

template<typename T>
void PrintArray(const CudaDeviceMem<T>& mem, const char* format, int numCols) {
    PrintArray(mem, mem.Size(), format, numCols);
}
template<typename T, typename Op>
void PrintArrayOp(const CudaDeviceMem<T>& mem, Op op, int numCols) {
    std::string s = FormatArrayOp(mem, op, numCols);
    printf("%s", s.c_str());
}
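
// Printing sketch: PrintArray copies the device data to the host and formats
// it with a printf-style specifier, numCols values per row:
//
//   MGPU_MEM(int) data = context->FillAscending<int>(32, 0, 1);
//   PrintArray(*data, "%6d", 8);    // 4 rows of 8 ints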


////////////////////////////////////////////////////////////////////////////////

} // namespace mgpu