Fermat
arch_inl.h
/*
 * cugar
 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of the NVIDIA CORPORATION nor the
 *     names of its contributors may be used to endorse or promote products
 *     derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

namespace cugar {
namespace cuda {

// get the cuda_devices singleton, creating it on first use
inline cuda_devices* cuda_devices::get()
{
    if (cuda_devices::s_cuda_devices == NULL)
    {
        ScopedLock lock( &cuda_devices::s_mutex );

        // re-check under the lock, so that two racing threads don't both allocate
        if (cuda_devices::s_cuda_devices == NULL)
            cuda_devices::s_cuda_devices = new cuda_devices();
    }
    return cuda_devices::s_cuda_devices;
}

// get device properties (for the current device)
inline cudaDeviceProp get_device_properties()
{
    int device;
    cudaGetDevice( &device );

    cuda_devices* devices = cuda_devices::get();

    return devices->properties[ device ];
}

// compute capability (major, minor) of the current device
inline void device_arch(uint32& major, uint32& minor)
{
    cudaDeviceProp properties = get_device_properties();

    major = properties.major;
    minor = properties.minor;
}

// maximum one-dimensional grid size (for the current device)
inline uint32 max_grid_size()
{
    uint32 major, minor;
    device_arch( major, minor );
    return major <= 2 ? 32*1024 : uint32(-1);
}

// number of multiprocessors (for the current device)
inline size_t multiprocessor_count()
{
    cudaDeviceProp properties = get_device_properties();

    return properties.multiProcessorCount;
}

// granularity of shared memory allocation
inline size_t smem_allocation_unit(const cudaDeviceProp& properties)
{
    switch (properties.major)
    {
    case 1:  return 512;
    case 2:  return 128;
    case 3:  return 256;
    default: return 256; // unknown GPU; have to guess
    }
}

// granularity of register allocation
inline size_t reg_allocation_unit(const cudaDeviceProp& properties, const size_t regsPerThread)
{
    switch (properties.major)
    {
    case 1:  return (properties.minor <= 1) ? 256 : 512;
    case 2:  switch (regsPerThread)
             {
             case 21:
             case 22:
             case 29:
             case 30:
             case 37:
             case 38:
             case 45:
             case 46:
                 return 128;
             default:
                 return 64;
             }
    case 3:  return 256;
    default: return 256; // unknown GPU; have to guess
    }
}
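
// For illustration (values taken from the switch above): on an sm_2x GPU a kernel
// compiled to 29 registers per thread gets a 128-register allocation granularity,
// so a 32-thread warp nominally needing 29*32 = 928 registers actually reserves
// round_i(928, 128) = 1024 registers in the SM's register file.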

// granularity of warp allocation
inline size_t warp_allocation_multiple(const cudaDeviceProp& properties)
{
    return (properties.major <= 1) ? 2 : 1;
}

// number of "sides" into which the multiprocessor is partitioned
inline size_t num_sides_per_multiprocessor(const cudaDeviceProp& properties)
{
    switch (properties.major)
    {
    case 1:  return 1;
    case 2:  return 2;
    case 3:  return 4;
    default: return 4; // unknown GPU; have to guess
    }
}

template <typename KernelFunction>
inline cudaFuncAttributes function_attributes(KernelFunction kernel)
{
    cudaFuncAttributes attributes = cudaFuncAttributes(); // zero-initialize, in case the query below is compiled out

#ifdef __CUDACC__
    typedef void (*fun_ptr_type)();

    fun_ptr_type fun_ptr = (fun_ptr_type)(kernel);

    cudaFuncGetAttributes(&attributes, fun_ptr);
#endif
    return attributes;
}

// maximum number of blocks per multiprocessor
inline size_t max_blocks_per_multiprocessor(const cudaDeviceProp& properties)
{
    return properties.major <= 2 ? 8 : 16;
}

// maximum number of CTAs allowed by the register file (for a given CTA size)
inline size_t num_regs_per_block(const cudaDeviceProp& properties, const cudaFuncAttributes& attributes, const size_t CTA_SIZE)
{
    const size_t maxBlocksPerSM         = max_blocks_per_multiprocessor(properties);
    const size_t regAllocationUnit      = reg_allocation_unit(properties, attributes.numRegs);
    const size_t warpAllocationMultiple = warp_allocation_multiple(properties);
    const size_t numWarps = round_i(divide_ri(CTA_SIZE, properties.warpSize), warpAllocationMultiple);

    // Calc limit
    if (properties.major <= 1)
    {
        // GPUs of compute capability 1.x allocate registers to CTAs
        // Number of regs per block is regs per thread times number of warps times warp size, rounded up to allocation unit
        const size_t regsPerCTA = round_i(attributes.numRegs * properties.warpSize * numWarps, regAllocationUnit);
        return regsPerCTA > 0 ? properties.regsPerBlock / regsPerCTA : maxBlocksPerSM;
    }
    else
    {
        // GPUs of compute capability 2.x and higher allocate registers to warps
        // Number of regs per warp is regs per thread times warp size, rounded up to allocation unit
        const size_t regsPerWarp    = round_i(attributes.numRegs * properties.warpSize, regAllocationUnit);
        const size_t numSides       = num_sides_per_multiprocessor(properties);
        const size_t numRegsPerSide = properties.regsPerBlock / numSides;
        return regsPerWarp > 0 ? ((numRegsPerSide / regsPerWarp) * numSides) / numWarps : maxBlocksPerSM;
    }
}
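
// Worked example (illustrative numbers, assuming a Fermi-class sm_20 device with a
// 32768-register file per SM, warpSize = 32, and a kernel using 21 registers/thread):
//   regsPerWarp    = round_i(21*32, 128) = 768
//   numSides       = 2, numRegsPerSide = 32768/2 = 16384
//   for CTA_SIZE = 256, numWarps = 8, so the register limit is
//   ((16384/768) * 2) / 8 = (21*2)/8 = 5 CTAs per multiprocessor.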

inline size_t max_active_blocks_per_multiprocessor(const cudaDeviceProp&     properties,
                                                   const cudaFuncAttributes& attributes,
                                                   size_t                    CTA_SIZE,
                                                   size_t                    dynamic_smem_bytes)
{
    // Determine the maximum number of CTAs that can be run simultaneously per SM.
    // This is equivalent to the calculation done in the CUDA Occupancy Calculator spreadsheet.

    // Limits due to threads/SM or blocks/SM
    const size_t maxThreadsPerSM = properties.maxThreadsPerMultiProcessor; // 768, 1024, 1536, etc.
    const size_t maxBlocksPerSM  = max_blocks_per_multiprocessor(properties);

    // Calc limits
    const size_t ctaLimitThreads = (CTA_SIZE <= size_t(properties.maxThreadsPerBlock)) ? maxThreadsPerSM / CTA_SIZE : 0;
    const size_t ctaLimitBlocks  = maxBlocksPerSM;

    // Limits due to shared memory/SM
    const size_t smemAllocationUnit = smem_allocation_unit(properties);
    const size_t smemBytes  = attributes.sharedSizeBytes + dynamic_smem_bytes;
    const size_t smemPerCTA = round_i(smemBytes, smemAllocationUnit);

    // Calc limit
    const size_t ctaLimitSMem = smemPerCTA > 0 ? properties.sharedMemPerBlock / smemPerCTA : maxBlocksPerSM;

    // Limits due to registers/SM
    const size_t ctaLimitRegs = num_regs_per_block( properties, attributes, CTA_SIZE );

    // Overall limit is min() of limits due to above reasons
    return cugar::min(ctaLimitRegs, cugar::min(ctaLimitSMem, cugar::min(ctaLimitThreads, ctaLimitBlocks)));
}
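
// Worked example (illustrative, continuing the sm_20 numbers above, with CTA_SIZE = 256,
// 4096 bytes of dynamic shared memory and no static shared memory):
//   ctaLimitThreads = 1536/256 = 6, ctaLimitBlocks = 8,
//   ctaLimitSMem = 49152/round_i(4096,128) = 12, ctaLimitRegs = 5,
//   so at most min(5, 12, 6, 8) = 5 CTAs (1280 threads) are resident per SM.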

template <typename KernelFunction>
size_t max_active_blocks_per_multiprocessor(KernelFunction kernel, const size_t CTA_SIZE, const size_t dynamic_smem_bytes)
{
  #if 0
    cudaDeviceProp properties = get_device_properties();

    cudaFuncAttributes attributes = function_attributes( kernel );

    return max_active_blocks_per_multiprocessor(properties, attributes, CTA_SIZE, dynamic_smem_bytes);
  #else
    // defer to the runtime's built-in occupancy calculator
    int maxActiveBlocks;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor( &maxActiveBlocks, kernel, int(CTA_SIZE), dynamic_smem_bytes );
    return size_t(maxActiveBlocks);
  #endif
}

template <typename KernelFunction>
size_t max_active_blocks(KernelFunction kernel, const size_t CTA_SIZE, const size_t dynamic_smem_bytes)
{
    cudaDeviceProp properties = get_device_properties();

    cudaFuncAttributes attributes = function_attributes( kernel );

    return properties.multiProcessorCount * max_active_blocks_per_multiprocessor(properties, attributes, CTA_SIZE, dynamic_smem_bytes);
}

template <typename KernelFunction>
size_t num_registers(KernelFunction kernel)
{
    cudaFuncAttributes attributes = function_attributes( kernel );
    return attributes.numRegs;
}

inline size_t max_blocksize_with_highest_occupancy(const cudaDeviceProp&     properties,
                                                   const cudaFuncAttributes& attributes,
                                                   size_t                    dynamic_smem_bytes_per_thread)
{
    size_t max_occupancy     = properties.maxThreadsPerMultiProcessor;
    size_t largest_blocksize = cugar::min( properties.maxThreadsPerBlock, attributes.maxThreadsPerBlock );
    size_t granularity       = properties.warpSize;
    size_t max_blocksize     = 0;
    size_t highest_occupancy = 0;

    for (size_t blocksize = largest_blocksize; blocksize != 0; blocksize -= granularity)
    {
        size_t occupancy = blocksize * max_active_blocks_per_multiprocessor(properties, attributes, blocksize, dynamic_smem_bytes_per_thread * blocksize);

        if (occupancy > highest_occupancy)
        {
            max_blocksize     = blocksize;
            highest_occupancy = occupancy;
        }

        // early out, can't do better
        if (highest_occupancy == max_occupancy)
            return max_blocksize;
    }
    return max_blocksize;
}
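
// Note: the search above walks candidate block sizes downwards one warp at a time
// (e.g. 1024, 992, 960, ... on a device with warpSize = 32), keeping the size with
// the highest thread occupancy and stopping early once the SM's thread capacity
// (maxThreadsPerMultiProcessor) is fully covered.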

// a utility unary functor to return the number of smem bytes per block, given the amount of smem bytes per thread
//
struct PerThreadSmemUnaryFunction
{
    // constructor
    CUGAR_HOST_DEVICE
    PerThreadSmemUnaryFunction(const int _bytes_per_thread) : bytes_per_thread(_bytes_per_thread) {}

    // unary operator
    CUGAR_HOST_DEVICE
    int operator() (const int block_size) const { return block_size * bytes_per_thread; }

    int bytes_per_thread;
};
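
// This functor matches the shape cudaOccupancyMaxPotentialBlockSizeVariableSMem expects:
// a unary callable mapping a candidate block size to the dynamic shared-memory bytes that
// block size would consume, e.g. PerThreadSmemUnaryFunction(64)(256) = 16384 bytes.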

template <typename KernelFunction>
size_t max_blocksize_with_highest_occupancy(KernelFunction kernel, size_t dynamic_smem_bytes_per_thread)
{
#if 0
    cudaDeviceProp properties = get_device_properties();

    cudaFuncAttributes attributes = function_attributes( kernel );

    return max_blocksize_with_highest_occupancy(properties, attributes, dynamic_smem_bytes_per_thread);
#else
    int blockSize;
    int minGridSize;
    cudaOccupancyMaxPotentialBlockSizeVariableSMem( &minGridSize, &blockSize, kernel, PerThreadSmemUnaryFunction(int(dynamic_smem_bytes_per_thread)) );
    return size_t(blockSize);
#endif
}

inline bool is_tcc_enabled()
{
    cudaDeviceProp properties = get_device_properties();

    return properties.tccDriver ? true : false;
}

inline void check_error(const cudaError_t error, const char* message)
{
    if (error != cudaSuccess)
    {
        const char* error_string = cudaGetErrorString(error);
        char error_message[2048];
        snprintf(error_message, sizeof(error_message), "%s in %s", error_string, message); // bounded, to avoid overflowing the buffer
        throw cuda_error( error_message );
    }
}
inline void check_error(const char* message)
{
    check_error( cudaGetLastError(), message );
}

inline void sync_and_check_error(const char* message)
{
    cudaDeviceSynchronize();
    check_error(cudaGetLastError(), message);
}
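
// Typical usage after a kernel launch (a minimal sketch; `my_kernel` is hypothetical):
//
//   my_kernel<<<n_blocks, blockdim>>>( args );
//   cugar::cuda::sync_and_check_error( "my_kernel" );   // throws cuda_error on failure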

// a generic syncthreads() implementation to synchronize contiguous
// blocks of N threads at a time
//
template <uint32 N>
CUGAR_FORCEINLINE CUGAR_HOST_DEVICE
void syncthreads()
{
  #if defined(CUGAR_DEVICE_COMPILATION)
    __syncthreads();
  #endif
}

} // namespace cuda
} // namespace cugar
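
// Example: picking launch parameters with the helpers above (a minimal sketch;
// `my_kernel`, `launch` and `d_data` are hypothetical):
//
//   __global__ void my_kernel(float* data, uint32 n) { ... }
//
//   void launch(float* d_data, const uint32 n_items)
//   {
//       // best-occupancy block size, assuming no dynamic smem per thread
//       const size_t blockdim = cugar::cuda::max_blocksize_with_highest_occupancy( my_kernel, 0u );
//       const size_t n_blocks = cugar::min(
//           size_t( cugar::divide_ri( size_t(n_items), blockdim ) ),
//           size_t( cugar::cuda::max_grid_size() ) );
//
//       my_kernel<<<n_blocks, blockdim>>>( d_data, n_items );
//       cugar::cuda::sync_and_check_error( "my_kernel" );
//   }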