Fermat
buffers.h
/*
 * Fermat
 *
 * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of the NVIDIA CORPORATION nor the
 *    names of its contributors may be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#pragma once

#include "types.h"

#include <cuda_runtime.h>
#include <stdlib.h>
#include <iostream>
#include <string>
#include <vector>

//------------------------------------------------------------------------------
#define CHK_CUDA( code )                                                    \
{                                                                           \
    cudaError_t err__ = code;                                               \
    if( err__ != cudaSuccess )                                              \
    {                                                                       \
        std::cerr << "Error on line " << __LINE__ << ":"                    \
                  << cudaGetErrorString( err__ ) << std::endl;              \
        exit(1);                                                            \
    }                                                                       \
}
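
// Example (an illustrative sketch, not part of the original header): CHK_CUDA
// wraps any CUDA runtime call so that a failure prints the failing line number
// and aborts, e.g.
//
//   CHK_CUDA( cudaDeviceSynchronize() );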

//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
enum PageLockedState
{
    UNLOCKED,
    LOCKED
};

enum BufferType
{
    HOST_BUFFER = 0x200,
    CUDA_BUFFER = 0x201,
};

template<typename T> class Buffer;
template<typename T> class ManagedBuffer;

//------------------------------------------------------------------------------
//
// Simple buffer class for buffers residing in managed memory
//
template<typename T>
class ManagedBuffer
{
public:
    ManagedBuffer(size_t count = 0)
      : m_ptr(0), m_count(0)
    {
        alloc(count);
    }

    ManagedBuffer(const ManagedBuffer<T>& src)
      : m_ptr(0), m_count(0)
    {
        this->operator=(src);
    }

    ManagedBuffer<T>& operator=(const ManagedBuffer<T>& src)
    {
        alloc(src.count());

        memcpy(m_ptr, src.ptr(), sizeInBytes());
        return *this;
    }

    // Allocate without changing type
    void alloc(size_t count)
    {
        free();

        cudaMallocManaged(&m_ptr, sizeof(T)*count);

        m_count = count;
    }

    void free()
    {
        if (m_ptr)
            cudaFree(m_ptr);

        m_ptr = 0;
        m_count = 0;
    }

    ~ManagedBuffer() { free(); }

    size_t count() const { return m_count; }
    size_t sizeInBytes() const { return m_count * sizeof(T); }
    const T* ptr() const { return m_ptr; }
    T* ptr() { return m_ptr; }

protected:
    T* m_ptr;
    size_t m_count;
};
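
// Example usage (an illustrative sketch, not part of the original header):
// a ManagedBuffer is addressable from both host and device code, so it can be
// filled on the CPU and passed straight to a kernel. The kernel name and launch
// sizes below are hypothetical.
//
//   ManagedBuffer<float> samples(1024);         // cudaMallocManaged storage for 1024 floats
//   for (size_t i = 0; i < samples.count(); ++i)
//       samples.ptr()[i] = float(i);            // written by the host
//   my_kernel<<<4, 256>>>(samples.ptr());       // same pointer consumed on the device
//   ManagedBuffer<float> copy = samples;        // deep copy through operator=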

//------------------------------------------------------------------------------
//
// Simple buffer class for buffers on the host or CUDA device
//
template<typename T>
class Buffer
{
public:
    Buffer(size_t count = 0, BufferType type = HOST_BUFFER, PageLockedState pageLockedState = UNLOCKED)
      : m_ptr( 0 ),
        m_device( 0 ),
        m_count( 0 ),
        m_type( type ),
        m_pageLockedState( pageLockedState )
    {
        alloc( count, type, pageLockedState );
    }

    Buffer(const Buffer<T>& src) : Buffer(0, src.type())
    {
        this->operator=(src);
    }

    Buffer<T>& operator=(const Buffer<T>& src)
    {
        alloc(src.count());

        if (src.type() == HOST_BUFFER)
        {
            if (type() == HOST_BUFFER)
                memcpy(m_ptr, src.ptr(), sizeInBytes());
            else
                cudaMemcpy(m_ptr, src.ptr(), sizeInBytes(), cudaMemcpyHostToDevice);
        }
        else
        {
            if (type() == HOST_BUFFER)
                cudaMemcpy(m_ptr, src.ptr(), sizeInBytes(), cudaMemcpyDeviceToHost);
            else
                cudaMemcpy(m_ptr, src.ptr(), sizeInBytes(), cudaMemcpyDeviceToDevice);
        }
        return *this;
    }
    Buffer<T>& operator=(const ManagedBuffer<T>& src)
    {
        alloc(src.count());

        if (type() == HOST_BUFFER)
            memcpy(m_ptr, src.ptr(), sizeInBytes());
        else
            cudaMemcpy(m_ptr, src.ptr(), sizeInBytes(), cudaMemcpyHostToDevice);

        return *this;
    }

    // Allocate without changing type
    void alloc( size_t count )
    {
        alloc( count, m_type, m_pageLockedState );
    }

    void alloc(size_t count, BufferType type, PageLockedState pageLockedState = UNLOCKED)
    {
        if (m_ptr)
            free();

        m_type = type;
        m_count = count;
        if (m_count > 0)
        {
            if (m_type == HOST_BUFFER)
            {
                m_ptr = new T[m_count];
                if( pageLockedState == LOCKED )
                    cudaHostRegister( m_ptr, sizeInBytes(), cudaHostRegisterDefault ); // for improved transfer performance
                m_pageLockedState = pageLockedState;
            }
            else
            {
                CHK_CUDA( cudaGetDevice( &m_device ) );
                CHK_CUDA( cudaMalloc( &m_ptr, sizeInBytes() ) );
            }
        }
    }

    void resize(const size_t count)
    {
        Buffer<T> buffer( count, m_type, m_pageLockedState );
        buffer.copy_from( count < m_count ? count : m_count, m_type, m_ptr );

        swap( buffer );
    }

    void copy_from(const size_t count, const BufferType src_type, const T* src, const uint32 dst_offset = 0)
    {
        assert(dst_offset + count <= m_count);
        if (count == 0)
            return;

        if (m_type == HOST_BUFFER)
        {
            if (src_type == HOST_BUFFER)
                memcpy( m_ptr + dst_offset, src, sizeof(T)*count );
            else
            {
                CHK_CUDA( cudaMemcpy( m_ptr + dst_offset, src, sizeof(T)*count, cudaMemcpyDeviceToHost ) );
            }
        }
        else
        {
            if (src_type == HOST_BUFFER)
            {
                CHK_CUDA( cudaMemcpy( m_ptr + dst_offset, src, sizeof(T)*count, cudaMemcpyHostToDevice ) );
            }
            else
            {
                CHK_CUDA( cudaMemcpy( m_ptr + dst_offset, src, sizeof(T)*count, cudaMemcpyDeviceToDevice ) );
            }
        }
    }

    void clear(const uint8 byte)
    {
        if (m_type == HOST_BUFFER)
            memset(m_ptr, byte, sizeInBytes());
        else
            cudaMemset(m_ptr, byte, sizeInBytes());
    }

    void free()
    {
        if (m_ptr)
        {
            if (m_type == HOST_BUFFER)
            {
                if (m_pageLockedState == LOCKED)
                    cudaHostUnregister(m_ptr);
                delete[] m_ptr;
            }
            else
            {
                int oldDevice;
                CHK_CUDA(cudaGetDevice(&oldDevice));
                CHK_CUDA(cudaSetDevice(m_device));
                CHK_CUDA(cudaFree(m_ptr));
                CHK_CUDA(cudaSetDevice(oldDevice));
            }
        }

        m_ptr = 0;
        m_count = 0;
    }

    ~Buffer() { free(); }

    size_t count() const { return m_count; }
    size_t sizeInBytes() const { return m_count * sizeof(T); }
    const T* ptr() const { return m_ptr; }
    T* ptr() { return m_ptr; }
    BufferType type() const { return m_type; }

    T operator[] (const size_t i) const
    {
        if (m_type == HOST_BUFFER)
            return m_ptr[i];
        else
        {
            T t;
            cudaMemcpy( &t, m_ptr + i, sizeof(T), cudaMemcpyDeviceToHost);
            return t;
        }
    }

    T& operator[] (const size_t i)
    {
        if (m_type == HOST_BUFFER)
            return m_ptr[i];
        else
        {
            static T t; // FIXME: this is not thread-safe!
            cudaMemcpy( &t, m_ptr + i, sizeof(T), cudaMemcpyDeviceToHost);
            return t;
        }
    }

    void set(const size_t i, const T val)
    {
        if (m_type == HOST_BUFFER)
            m_ptr[i] = val;
        else
            cudaMemcpy(m_ptr + i, &val, sizeof(T), cudaMemcpyHostToDevice);
    }

    void swap(Buffer<T>& buf)
    {
        std::swap(m_type, buf.m_type);
        std::swap(m_ptr, buf.m_ptr);
        std::swap(m_device, buf.m_device);
        std::swap(m_count, buf.m_count);
        std::swap(m_pageLockedState, buf.m_pageLockedState);
    }

protected:
    BufferType m_type;
    T* m_ptr;
    int m_device;
    size_t m_count;
    PageLockedState m_pageLockedState;
};
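
// Example usage (an illustrative sketch, not part of the original header):
// copying a page-locked host buffer to the device and reading one element back.
// The sizes and values are made up for the example.
//
//   Buffer<float> h_buf(16, HOST_BUFFER, LOCKED);    // page-locked host storage
//   for (size_t i = 0; i < h_buf.count(); ++i)
//       h_buf.set(i, 0.5f * float(i));
//
//   Buffer<float> d_buf(0, CUDA_BUFFER);
//   d_buf = h_buf;                                    // operator= issues cudaMemcpyHostToDevice
//   float x = d_buf[3];                               // operator[] copies a single element back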

//------------------------------------------------------------------------------
//
// Simple buffer class for buffers on the host or CUDA device, whose memory
// space is fixed at compile time by the TYPE template parameter
//
template<BufferType TYPE, typename T>
class DomainBuffer : public Buffer<T>
{
public:
    DomainBuffer(size_t count = 0, PageLockedState pageLockedState = UNLOCKED)
      : Buffer<T>(count, TYPE, pageLockedState)
    {}

    template <BufferType UTYPE>
    DomainBuffer(const DomainBuffer<UTYPE, T>& src) : Buffer<T>(0, TYPE)
    {
        this->operator=(src);
    }

    DomainBuffer<TYPE, T>& operator=(const Buffer<T>& src)
    {
        this->Buffer<T>::operator=(src);
        return *this;
    }
    DomainBuffer<TYPE, T>& operator=(const ManagedBuffer<T>& src)
    {
        this->Buffer<T>::operator=(src);
        return *this;
    }
};
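
// Example usage (an illustrative sketch, not part of the original header):
// DomainBuffer fixes the memory space in the type, so a host-to-device upload
// is just an assignment between the two specializations. n_vertices is a
// hypothetical count.
//
//   DomainBuffer<HOST_BUFFER, float4> h_vertices(n_vertices);
//   // ... fill h_vertices.ptr() on the CPU ...
//   DomainBuffer<CUDA_BUFFER, float4> d_vertices(0);
//   d_vertices = h_vertices;                          // cudaMemcpyHostToDevice under the hood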

FERMAT_API_EXTERN template class FERMAT_API DomainBuffer<HOST_BUFFER, float>;
FERMAT_API_EXTERN template class FERMAT_API DomainBuffer<CUDA_BUFFER, float>;
FERMAT_API_EXTERN template class FERMAT_API DomainBuffer<HOST_BUFFER, float4>;
FERMAT_API_EXTERN template class FERMAT_API DomainBuffer<CUDA_BUFFER, float4>;


inline float3 ptr_to_float3(const float* v) { return make_float3(v[0], v[1], v[2]); }