arch_inl.h
/*
 * nvbio
 * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *    * Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *    * Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *    * Neither the name of the NVIDIA CORPORATION nor the
 *      names of its contributors may be used to endorse or promote products
 *      derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

namespace nvbio {
namespace cuda {

// compute capability (major, minor) of the current device
inline void device_arch(uint32& major, uint32& minor)
{
    int device;
    cudaGetDevice( &device );
    cudaDeviceProp properties;
    cudaGetDeviceProperties( &properties, device );

    major = properties.major;
    minor = properties.minor;
}
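
// Example (editor's sketch, not part of the original sources): dispatch on
// the compute capability of the current device:
//
//   uint32 major, minor;
//   nvbio::cuda::device_arch( major, minor );
//   if (major >= 3) { /* Kepler-or-newer code path */ }
//   else            { /* fallback path */ }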

// maximum grid size (for the current device)
inline uint32 max_grid_size()
{
    uint32 major, minor;
    device_arch( major, minor );
    return major <= 2 ? 32*1024 : uint32(-1);
}
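
// Example (editor's sketch): clamp a launch to the device's maximum grid
// size; n_items and CTA_SIZE are hypothetical values of the caller:
//
//   const uint32 wanted   = (uint32)util::divide_ri( n_items, CTA_SIZE );
//   const uint32 n_blocks = nvbio::min( wanted, max_grid_size() );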

// number of multiprocessors (for the current device)
inline size_t multiprocessor_count()
{
    int device;
    cudaGetDevice( &device );

    cudaDeviceProp properties;
    cudaGetDeviceProperties( &properties, device );

    return properties.multiProcessorCount;
}
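
// Example (editor's sketch): scale a persistent-blocks launch with the number
// of multiprocessors; BLOCKS_PER_SM, CTA_SIZE and my_kernel are hypothetical:
//
//   const size_t n_blocks = BLOCKS_PER_SM * multiprocessor_count();
//   my_kernel<<<n_blocks, CTA_SIZE>>>( ... );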

// granularity of shared memory allocation
inline size_t smem_allocation_unit(const cudaDeviceProp& properties)
{
    return 512;
}

// granularity of register allocation
inline size_t reg_allocation_unit(const cudaDeviceProp& properties, const size_t regsPerThread)
{
    switch (properties.major)
    {
    case 1: return (properties.minor <= 1) ? 256 : 512;
    case 2: switch (regsPerThread)
            {
            case 21:
            case 22:
            case 29:
            case 30:
            case 37:
            case 38:
            case 45:
            case 46:
                return 128;
            default:
                return 64;
            }
    case 3:  return 256;
    default: return 256; // unknown GPU; have to guess
    }
}

// granularity of warp allocation
inline size_t warp_allocation_multiple(const cudaDeviceProp& properties)
{
    return 2;
}

// attributes of the compiled kernel (zero-initialized when this header is
// not compiled through nvcc)
template <typename KernelFunction>
inline cudaFuncAttributes function_attributes(KernelFunction kernel)
{
    cudaFuncAttributes attributes = cudaFuncAttributes();

#ifdef __CUDACC__
    typedef void (*fun_ptr_type)();

    fun_ptr_type fun_ptr = reinterpret_cast<fun_ptr_type>(kernel);

    cudaFuncGetAttributes(&attributes, fun_ptr);
#endif
    return attributes;
}
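
// Example (editor's sketch): inspect the compiled resource usage of a kernel;
// my_kernel is a hypothetical __global__ function:
//
//   const cudaFuncAttributes attr = function_attributes( my_kernel );
//   fprintf( stderr, "regs: %d, static smem: %u\n", attr.numRegs, (unsigned)attr.sharedSizeBytes );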

// maximum number of blocks per multiprocessor
inline size_t max_blocks_per_multiprocessor(const cudaDeviceProp& properties)
{
    return properties.major <= 2 ? 8 : 16;
}

// number of "sides" into which the multiprocessor is partitioned
inline size_t num_sides_per_multiprocessor(const cudaDeviceProp& properties)
{
    switch (properties.major)
    {
    case 1:  return 1;
    case 2:  return 2;
    case 3:  return 4;
    default: return 4; // unknown GPU; have to guess
    }
}

// register footprint of a block on compute capability 1.x devices; on 2.x
// and later, where registers are allocated to the warps within each of the
// multiprocessor's "sides", this returns the block limit imposed by register
// usage instead
inline size_t num_regs_per_block(const cudaDeviceProp& properties, const cudaFuncAttributes& attributes, const size_t CTA_SIZE)
{
    const size_t maxBlocksPerSM         = max_blocks_per_multiprocessor(properties);
    const size_t regAllocationUnit      = reg_allocation_unit(properties, attributes.numRegs);
    const size_t warpAllocationMultiple = warp_allocation_multiple(properties);

    // number of warps (rounded up to the nearest whole multiple of the warp allocation multiple)
    const size_t numWarps = util::round_i(util::divide_ri(CTA_SIZE, properties.warpSize), warpAllocationMultiple);

    if (properties.major < 2)
        return util::round_i(attributes.numRegs * properties.warpSize * numWarps, regAllocationUnit);
    else
    {
        const size_t regsPerWarp    = util::round_i(attributes.numRegs * properties.warpSize, regAllocationUnit);
        const size_t numSides       = num_sides_per_multiprocessor(properties);
        const size_t numRegsPerSide = properties.regsPerBlock / numSides;
        return regsPerWarp > 0 ? ((numRegsPerSide / regsPerWarp) * numSides) / numWarps : maxBlocksPerSM;
    }
}
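
// Worked example (editor's sketch) for a hypothetical compute capability 2.0
// device (warpSize = 32, regsPerBlock = 32768, 2 sides) and a kernel using
// 32 registers per thread at CTA_SIZE = 256:
//
//   numWarps       = round_i( divide_ri( 256, 32 ), 2 ) = 8
//   regsPerWarp    = round_i( 32*32, 64 )               = 1024
//   numRegsPerSide = 32768 / 2                          = 16384
//   return value   = ((16384 / 1024) * 2) / 8           = 4 blocks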

inline size_t max_active_blocks_per_multiprocessor(const cudaDeviceProp&        properties,
                                                   const cudaFuncAttributes&    attributes,
                                                   size_t                       CTA_SIZE,
                                                   size_t                       dynamic_smem_bytes)
{
    // Determine the maximum number of CTAs that can be run simultaneously per SM.
    // This is equivalent to the calculation done in the CUDA Occupancy Calculator spreadsheet.
    const size_t smemAllocationUnit = smem_allocation_unit(properties);
    const size_t maxThreadsPerSM    = properties.maxThreadsPerMultiProcessor;  // 768, 1024, 1536, etc.
    const size_t maxBlocksPerSM     = max_blocks_per_multiprocessor(properties);

    const size_t regsPerCTA = num_regs_per_block( properties, attributes, CTA_SIZE );

    const size_t smemBytes  = attributes.sharedSizeBytes + dynamic_smem_bytes;
    const size_t smemPerCTA = util::round_i(smemBytes, smemAllocationUnit);

    // on compute capability 1.x, num_regs_per_block() returns the register
    // footprint of a CTA, so the limit is regsPerBlock / footprint; on 2.x
    // and later it already returns the CTA limit
    const size_t ctaLimitRegs    = properties.major < 2 ?
        (regsPerCTA > 0 ? properties.regsPerBlock / regsPerCTA : maxBlocksPerSM) :
        regsPerCTA;
    const size_t ctaLimitSMem    = smemPerCTA > 0 ? properties.sharedMemPerBlock / smemPerCTA : maxBlocksPerSM;
    const size_t ctaLimitThreads = maxThreadsPerSM / CTA_SIZE;

    return nvbio::min( (uint32)ctaLimitRegs, nvbio::min( (uint32)ctaLimitSMem, nvbio::min( (uint32)ctaLimitThreads, (uint32)maxBlocksPerSM ) ) );
}

template <typename KernelFunction>
size_t max_active_blocks_per_multiprocessor(KernelFunction kernel, const size_t CTA_SIZE, const size_t dynamic_smem_bytes)
{
    int device;
    cudaGetDevice( &device );

    cudaDeviceProp properties;
    cudaGetDeviceProperties( &properties, device );

    cudaFuncAttributes attributes = function_attributes( kernel );

    return max_active_blocks_per_multiprocessor(properties, attributes, CTA_SIZE, dynamic_smem_bytes);
}
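
// Example (editor's sketch): query the occupancy of a kernel at a fixed block
// size before launching it; my_kernel is hypothetical:
//
//   const size_t blocks_per_sm = max_active_blocks_per_multiprocessor( my_kernel, 128u, 0u );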

template <typename KernelFunction>
size_t max_active_blocks(KernelFunction kernel, const size_t CTA_SIZE, const size_t dynamic_smem_bytes)
{
    int device;
    cudaGetDevice( &device );

    cudaDeviceProp properties;
    cudaGetDeviceProperties( &properties, device );

    cudaFuncAttributes attributes = function_attributes( kernel );

    return properties.multiProcessorCount * max_active_blocks_per_multiprocessor(properties, attributes, CTA_SIZE, dynamic_smem_bytes);
}
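
// Example (editor's sketch): launch as many blocks as can be resident on the
// whole device, e.g. for a persistent-threads kernel; my_kernel is hypothetical:
//
//   const size_t n_blocks = max_active_blocks( my_kernel, 128u, 0u );
//   my_kernel<<<n_blocks, 128>>>( ... );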

template <typename KernelFunction>
size_t num_registers(KernelFunction kernel)
{
    cudaFuncAttributes attributes = function_attributes( kernel );
    return attributes.numRegs;
}

inline size_t max_blocksize_with_highest_occupancy(const cudaDeviceProp&        properties,
                                                   const cudaFuncAttributes&    attributes,
                                                   size_t                       dynamic_smem_bytes_per_thread)
{
    size_t max_occupancy     = properties.maxThreadsPerMultiProcessor;
    size_t largest_blocksize = nvbio::min( properties.maxThreadsPerBlock, attributes.maxThreadsPerBlock );
    size_t granularity       = properties.warpSize;
    size_t max_blocksize     = 0;
    size_t highest_occupancy = 0;

    for (size_t blocksize = largest_blocksize; blocksize != 0; blocksize -= granularity)
    {
        size_t occupancy = blocksize * max_active_blocks_per_multiprocessor(properties, attributes, blocksize, dynamic_smem_bytes_per_thread * blocksize);

        if (occupancy > highest_occupancy)
        {
            max_blocksize     = blocksize;
            highest_occupancy = occupancy;
        }

        // early out, can't do better
        if (highest_occupancy == max_occupancy)
            return max_blocksize;
    }

    return max_blocksize;
}

template <typename KernelFunction>
size_t max_blocksize_with_highest_occupancy(KernelFunction kernel, size_t dynamic_smem_bytes_per_thread)
{
    int device;
    cudaDeviceProp properties;
    cudaGetDevice( &device );
    cudaGetDeviceProperties( &properties, device );

    cudaFuncAttributes attributes = function_attributes( kernel );

    return max_blocksize_with_highest_occupancy(properties, attributes, dynamic_smem_bytes_per_thread);
}
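
// Example (editor's sketch): pick the block size with the highest occupancy
// and use it for the launch; my_kernel and n_items are hypothetical, and the
// second argument is the dynamic shared memory needed per thread:
//
//   const size_t CTA_SIZE = max_blocksize_with_highest_occupancy( my_kernel, 0u );
//   const size_t n_blocks = util::divide_ri( n_items, CTA_SIZE );
//   my_kernel<<<n_blocks, CTA_SIZE>>>( ... );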

inline bool is_tcc_enabled()
{
    int device;
    cudaDeviceProp device_properties;
    cudaGetDevice( &device );
    cudaGetDeviceProperties( &device_properties, device );
    return device_properties.tccDriver ? true : false;
}
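
// Example (editor's sketch): warn when the device is running the WDDM driver
// rather than TCC, e.g. before timing-sensitive benchmarks:
//
//   if (is_tcc_enabled() == false)
//       fprintf( stderr, "warning: device is not running the TCC driver\n" );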

inline void check_error(const char* message)
{
    cudaError_t error = cudaGetLastError();
    if (error != cudaSuccess)
    {
        const char* error_string = cudaGetErrorString(error);
        log_error(stderr, "%s: %s\n", message, error_string);
        throw cuda_error( error_string );
    }
}
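
// Example (editor's sketch): detect asynchronous launch failures after a
// kernel invocation; my_kernel, n_blocks and CTA_SIZE are hypothetical:
//
//   my_kernel<<<n_blocks, CTA_SIZE>>>( ... );
//   cudaDeviceSynchronize();
//   check_error( "my_kernel" );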

// a generic syncthreads() implementation to synchronize contiguous
// blocks of N threads at a time
//
template <uint32 N>
NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
void syncthreads()
{
    #if defined(NVBIO_DEVICE_COMPILATION)
    __syncthreads();
    #endif
}
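
// Example (editor's sketch): a block-wide barrier between the write and read
// phases of a shared memory exchange; smem, partial and CTA_SIZE are
// hypothetical, and the call compiles to a no-op on the host path:
//
//   smem[ threadIdx.x ] = partial;
//   syncthreads<CTA_SIZE>();
//   const float neighbor = smem[ threadIdx.x ^ 1 ];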

} // namespace cuda
} // namespace nvbio