#ifdef _DEBUG
#define MGPU_SYNC_CHECK(s) {                                                \
    cudaError_t error = cudaDeviceSynchronize();                            \
    if(cudaSuccess != error) {                                              \
        printf("CUDA ERROR %d %s\n%s:%d.\n%s\n",                            \
            error, cudaGetErrorString(error), __FILE__, __LINE__, s);       \
        exit(0);                                                            \
    }                                                                       \
}
#else
#define MGPU_SYNC_CHECK(s)
#endif
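// Usage sketch (illustrative, not part of the original header): invoke
// MGPU_SYNC_CHECK after a kernel launch. Debug builds synchronize and report
// the failure site; release builds compile the check away entirely.
//
//     myKernel<<<numBlocks, numThreads>>>(args);    // hypothetical kernel
//     MGPU_SYNC_CHECK("myKernel");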
template<typename T>
void copyDtoH(T* dest, const T* source, int count) {
    cudaMemcpy(dest, source, sizeof(T) * count, cudaMemcpyDeviceToHost);
}
template<typename T>
void copyDtoD(T* dest, const T* source, int count, cudaStream_t stream = 0) {
    cudaMemcpyAsync(dest, source, sizeof(T) * count, cudaMemcpyDeviceToDevice,
        stream);
}
template<typename T>
void copyDtoH(std::vector<T>& dest, const T* source, int count) {
    dest.resize(count);
    if(count) copyDtoH(&dest[0], source, count);
}
template<typename T>
void copyHtoD(T* dest, const T* source, int count) {
    cudaMemcpy(dest, source, sizeof(T) * count, cudaMemcpyHostToDevice);
}
template<typename T>
void copyHtoD(T* dest, const std::vector<T>& source) {
    if(source.size()) copyHtoD(dest, &source[0], (int)source.size());
}
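// Usage sketch (illustrative, not part of the original header): a full
// host -> device -> host round trip with the helpers above.
inline void exampleRoundTrip() {
    std::vector<float> host(1000, 1.0f);
    float* device = 0;
    cudaMalloc((void**)&device, sizeof(float) * host.size());
    copyHtoD(device, host);                          // host -> device
    copyDtoH(&host[0], device, (int)host.size());    // device -> host
    cudaFree(device);
}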
    // std::exception override for CudaException, the error type thrown on
    // CUDA runtime failures.
    virtual const char* what() const throw() {
        return "CUDA runtime error";
    }
    CudaEvent() {
        cudaEventCreate(&_event);
    }
    explicit CudaEvent(int flags) {
        cudaEventCreateWithFlags(&_event, flags);
    }
    ~CudaEvent() {
        cudaEventDestroy(_event);
    }
    // Implicit conversion lets a CudaEvent pass straight to CUDA APIs.
    operator cudaEvent_t() { return _event; }
    double Throughput(int count, int numIterations);
    const cudaDeviceProp& Prop() const { return _prop; }
    int NumSMs() const { return _prop.multiProcessorCount; }
    int ArchVersion() const { return 100 * _prop.major + 10 * _prop.minor; }
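    // For example, a compute capability 3.5 device reports
    // ArchVersion() == 350, and a 2.0 device reports 200.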
    cudaDeviceProp _prop;
    const T* get() const { return _p; }
    T* get() { return _p; }

    operator const T*() const { return get(); }
    operator T*() { return get(); }

    size_t Size() const { return _size; }
    // Copy from this device allocation into the argument array.
    cudaError_t ToDevice(T* data, size_t count) const;
    cudaError_t ToDevice(size_t srcOffset, size_t bytes, void* data) const;
    cudaError_t ToHost(T* data, size_t count) const;
    cudaError_t ToHost(std::vector<T>& data) const;
    cudaError_t ToHost(std::vector<T>& data, size_t count) const;
    cudaError_t ToHost(size_t srcOffset, size_t bytes, void* data) const;
    // Copy from the argument array into this device allocation.
    cudaError_t FromDevice(const T* data, size_t count);
    cudaError_t FromDevice(size_t dstOffset, size_t bytes, const void* data);
    cudaError_t FromHost(const std::vector<T>& data);
    cudaError_t FromHost(const std::vector<T>& data, size_t count);
    cudaError_t FromHost(const T* data, size_t count);
    cudaError_t FromHost(size_t dstOffset, size_t bytes, const void* data);
typedef intrusive_ptr<CudaAlloc> AllocPtr;

#define MGPU_MEM(type) mgpu::intrusive_ptr< mgpu::CudaDeviceMem< type > >
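// Usage sketch (illustrative, not part of the original header): MGPU_MEM(T)
// is the reference-counted handle that MGPU routines traffic in; the handle
// releases the device allocation when the last reference drops.
inline void exampleReadBack(MGPU_MEM(float) deviceData) {
    std::vector<float> host;
    deviceData->ToHost(host);    // copy the entire allocation to the host
}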
    template<typename T>
    MGPU_MEM(T) SortRandom(size_t count, T min, T max);
    template<typename T, typename Func>
    MGPU_MEM(T) GenFunc(size_t count, Func f);
ContextPtr CreateCudaDevice(int argc, char** argv, bool printInfo = false);
    cudaStream_t Stream() const { return _stream; }
    double Throughput(int count, int numIterations) {
        return _timer.Throughput(count, numIterations);
    }
    cudaStream_t _stream;
    cudaStream_t _auxStream;
template<typename T>
cudaError_t CudaDeviceMem<T>::ToDevice(T* data, size_t count) const {
    return ToDevice(0, sizeof(T) * count, data);
}
template<typename T>
cudaError_t CudaDeviceMem<T>::ToDevice(size_t srcOffset, size_t bytes,
    void* data) const {
    // srcOffset is in bytes, so offset through a char* view of the pointer.
    cudaError_t error = cudaMemcpy(data, (char*)_p + srcOffset, bytes,
        cudaMemcpyDeviceToDevice);
    if(cudaSuccess != error) {
        printf("CudaDeviceMem::ToDevice copy error %d\n", error);
        exit(0);
    }
    return error;
}
template<typename T>
cudaError_t CudaDeviceMem<T>::ToHost(T* data, size_t count) const {
    return ToHost(0, sizeof(T) * count, data);
}
template<typename T>
cudaError_t CudaDeviceMem<T>::ToHost(std::vector<T>& data, size_t count) const {
    data.resize(count);
    cudaError_t error = cudaSuccess;
    if(_size) error = ToHost(&data[0], count);
    return error;
}
template<typename T>
cudaError_t CudaDeviceMem<T>::ToHost(std::vector<T>& data) const {
    return ToHost(data, _size);
}
template<typename T>
cudaError_t CudaDeviceMem<T>::ToHost(size_t srcOffset, size_t bytes,
    void* data) const {
    cudaError_t error = cudaMemcpy(data, (char*)_p + srcOffset, bytes,
        cudaMemcpyDeviceToHost);
    if(cudaSuccess != error) {
        printf("CudaDeviceMem::ToHost copy error %d\n", error);
        exit(0);
    }
    return error;
}
template<typename T>
cudaError_t CudaDeviceMem<T>::FromDevice(const T* data, size_t count) {
    return FromDevice(0, sizeof(T) * count, data);
}
template<typename T>
cudaError_t CudaDeviceMem<T>::FromDevice(size_t dstOffset, size_t bytes,
    const void* data) {
    if(dstOffset + bytes > sizeof(T) * _size)
        return cudaErrorInvalidValue;
    // dstOffset is a byte offset; cast to char* so it is not scaled by
    // sizeof(T).
    cudaMemcpy((char*)_p + dstOffset, data, bytes, cudaMemcpyDeviceToDevice);
    return cudaSuccess;
}
template<typename T>
cudaError_t CudaDeviceMem<T>::FromHost(const std::vector<T>& data,
    size_t count) {
    cudaError_t error = cudaSuccess;
    if(data.size()) error = FromHost(&data[0], count);
    return error;
}
template<typename T>
cudaError_t CudaDeviceMem<T>::FromHost(const std::vector<T>& data) {
    return FromHost(data, data.size());
}
template<typename T>
cudaError_t CudaDeviceMem<T>::FromHost(const T* data, size_t count) {
    return FromHost(0, sizeof(T) * count, data);
}
template<typename T>
cudaError_t CudaDeviceMem<T>::FromHost(size_t dstOffset, size_t bytes,
    const void* data) {
    if(dstOffset + bytes > sizeof(T) * _size)
        return cudaErrorInvalidValue;
    // Same byte-offset convention as FromDevice above.
    cudaMemcpy((char*)_p + dstOffset, data, bytes, cudaMemcpyHostToDevice);
    return cudaSuccess;
}
template<typename T>
MGPU_MEM(T) CudaMemSupport::Malloc(size_t count) {
    MGPU_MEM(T) mem(new CudaDeviceMem<T>(_alloc.get()));
    mem->_size = count;
    cudaError_t error = _alloc->Malloc(sizeof(T) * count, (void**)&mem->_p);
    if(cudaSuccess != error) {
        printf("cudaMalloc error %d\n", error);
        exit(0);
    }
    return mem;
}
template<typename T>
MGPU_MEM(T) CudaMemSupport::Malloc(const T* data, size_t count) {
    MGPU_MEM(T) mem = Malloc<T>(count);
    mem->FromHost(data, count);
    return mem;
}
template<typename T>
MGPU_MEM(T) CudaMemSupport::Malloc(const std::vector<T>& data) {
    MGPU_MEM(T) mem = Malloc<T>(data.size());
    if(data.size()) mem->FromHost(&data[0], data.size());
    return mem;
}
template<typename T>
MGPU_MEM(T) CudaMemSupport::Fill(size_t count, T fill) {
    std::vector<T> data(count, fill);
    return Malloc(data);
}
template<typename T>
MGPU_MEM(T) CudaMemSupport::FillAscending(size_t count, T first, T step) {
    std::vector<T> data(count);
    for(size_t i = 0; i < count; ++i)
        data[i] = first + i * step;
    return Malloc(data);
}
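// For example (illustrative): FillAscending(5, 0, 2) uploads {0, 2, 4, 6, 8}.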
template<typename T>
MGPU_MEM(T) CudaMemSupport::GenRandom(size_t count, T min, T max) {
    std::vector<T> data(count);
    for(size_t i = 0; i < count; ++i)
        data[i] = Rand(min, max);
    return Malloc(data);
}
template<typename T>
MGPU_MEM(T) CudaMemSupport::SortRandom(size_t count, T min, T max) {
    std::vector<T> data(count);
    for(size_t i = 0; i < count; ++i)
        data[i] = Rand(min, max);
    std::sort(data.begin(), data.end());
    return Malloc(data);
}
template<typename T, typename Func>
MGPU_MEM(T) CudaMemSupport::GenFunc(size_t count, Func f) {
    std::vector<T> data(count);
    for(size_t i = 0; i < count; ++i)
        data[i] = f(i);

    MGPU_MEM(T) mem = Malloc<T>(count);
    mem->FromHost(data, count);
    return mem;
}
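// Usage sketch (illustrative; 'context' and SquareOp are assumptions, not
// part of the original header): fill device memory from a host functor.
struct SquareOp {
    int operator()(size_t i) const { return int(i * i); }
};
// MGPU_MEM(int) squares = context.GenFunc<int>(1000, SquareOp());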
// Format and print routines that operate directly on device memory.
template<typename T, typename Op>
std::string FormatArrayOp(const CudaDeviceMem<T>& mem, Op op, int numCols) {
    std::vector<T> host;
    mem.ToHost(host);
    return FormatArrayOp(host, op, numCols);
}
template<typename T, typename Op>
void PrintArrayOp(const CudaDeviceMem<T>& mem, Op op, int numCols) {
    std::string s = FormatArrayOp(mem, op, numCols);
    printf("%s", s.c_str());
}
template<typename T, typename Op>
void PrintArrayOp(const CudaDeviceMem<T>& mem, size_t count, Op op,
    int numCols) {
    std::vector<T> host;
    mem.ToHost(host, count);
    std::string s = FormatArrayOp(host, op, numCols);
    printf("%s", s.c_str());
}