// Look up the currently selected CUDA device, read its property record, and
// copy the `major` / `minor` fields of that record into `major` and `minor`.
// NOTE(review): the status codes returned by cudaGetDevice and
// cudaGetDeviceProperties are discarded, so a failed query goes unnoticed
// here — confirm whether callers rely on a prior successful initialisation.
35 cudaGetDevice( &device );
36 cudaDeviceProp properties;
37 cudaGetDeviceProperties( &properties, device );
39 major = properties.major;
40 minor = properties.minor;
48 return major <= 2 ? 32*1024 :
uint32(-1);
// Query the currently selected device and return the `multiProcessorCount`
// field of its property record.
// NOTE(review): the CUDA status codes of both calls are ignored.
55 cudaGetDevice( &device );
57 cudaDeviceProp properties;
58 cudaGetDeviceProperties( &properties, device );
60 return properties.multiProcessorCount;
72 switch(properties.major)
74 case 1:
return (properties.minor <= 1) ? 256 : 512;
75 case 2:
switch(regsPerThread)
// Retrieves the cudaFuncAttributes record of `kernel` (presumably a parameter
// of type KernelFunction; the signature line is not visible in this excerpt).
100 template <
typename KernelFunction>
103 cudaFuncAttributes attributes;
// Generic function-pointer type: the kernel is cast to it so that the
// attribute query below does not depend on the kernel's concrete signature.
106 typedef void (*fun_ptr_type)();
108 fun_ptr_type fun_ptr =
reinterpret_cast<fun_ptr_type
>(kernel);
// NOTE(review): the returned status code is discarded; on failure
// `attributes` would be left as declared above (uninitialised).
110 cudaFuncGetAttributes(&attributes, fun_ptr);
118 return properties.major <= 2 ? 8 : 16;
124 switch(properties.major)
// Register-usage helper for a kernel on a given device.
//
// properties : device property record; `major`, `warpSize` and `regsPerBlock`
//              are read in the visible lines.
// attributes : kernel attribute record; `numRegs` is read.
// CTA_SIZE   : not referenced in the visible lines — presumably it feeds
//              `numWarps`, whose definition is not shown here (TODO confirm).
//
// `regAllocationUnit`, `numWarps`, `numSides` and `maxBlocksPerSM` are defined
// on lines that are not visible in this excerpt.
134 inline size_t num_regs_per_block(
const cudaDeviceProp& properties,
const cudaFuncAttributes& attributes,
const size_t CTA_SIZE)
// Devices with major < 2: registers for all `numWarps` warps together, passed
// through util::round_i with `regAllocationUnit` (presumably rounding up to a
// multiple of that unit — verify against util::round_i).
143 if (properties.major < 2)
144 return util::round_i(attributes.numRegs * properties.warpSize * numWarps, regAllocationUnit);
// Otherwise: registers for a single warp, rounded with the same unit.
147 const size_t regsPerWarp =
util::round_i(attributes.numRegs * properties.warpSize, regAllocationUnit);
// properties.regsPerBlock is divided evenly between `numSides`.
149 const size_t numRegsPerSide = properties.regsPerBlock / numSides;
// Guard against regsPerWarp == 0 by falling back to maxBlocksPerSM.
// NOTE(review): this expression is (warps that fit) / numWarps, i.e. it looks
// like a count of blocks, whereas the major < 2 path above yields a register
// count — confirm that both paths return the same kind of quantity.
150 return regsPerWarp > 0 ? ((numRegsPerSide / regsPerWarp) * numSides) / numWarps : maxBlocksPerSM;
155 const cudaFuncAttributes& attributes,
157 size_t dynamic_smem_bytes)
// Computes three independent upper bounds on the number of CTAs: one from
// registers, one from shared memory and one from the thread count.
// `regsPerCTA`, `smemAllocationUnit`, `maxBlocksPerSM` and `CTA_SIZE` come from
// lines that are not visible in this excerpt.
162 const size_t maxThreadsPerSM = properties.maxThreadsPerMultiProcessor;
// Shared memory per CTA = static usage reported for the kernel plus the
// dynamic amount requested by the caller, rounded with smemAllocationUnit.
168 const size_t smemBytes = attributes.sharedSizeBytes + dynamic_smem_bytes;
169 const size_t smemPerCTA =
util::round_i(smemBytes, smemAllocationUnit);
// A per-CTA cost of zero would divide by zero, so those cases fall back to
// maxBlocksPerSM.
// NOTE(review): the numerators are the per-block device limits (regsPerBlock,
// sharedMemPerBlock); cudaDeviceProp also exposes regsPerMultiprocessor and
// sharedMemPerMultiprocessor — confirm which limit is intended here.
171 const size_t ctaLimitRegs = regsPerCTA > 0 ? properties.regsPerBlock / regsPerCTA : maxBlocksPerSM;
172 const size_t ctaLimitSMem = smemPerCTA > 0 ? properties.sharedMemPerBlock / smemPerCTA : maxBlocksPerSM;
// NOTE(review): unlike the two limits above there is no zero guard, so
// CTA_SIZE is assumed to be non-zero.
173 const size_t ctaLimitThreads = maxThreadsPerSM / CTA_SIZE;
178 template <
typename KernelFunction>
182 cudaGetDevice( &device );
184 cudaDeviceProp properties;
185 cudaGetDeviceProperties( &properties, device );
// max_active_blocks: entry point taking a kernel, a block size and a dynamic
// shared-memory size.
//
// kernel             : the kernel function (any KernelFunction type).
// CTA_SIZE           : threads per block.
// dynamic_smem_bytes : dynamically allocated shared memory, in bytes.
//
// The visible part of the body only fetches the property record of the
// currently selected device; how the result is derived from it is on lines
// not shown in this excerpt.
// NOTE(review): the CUDA status codes of both calls are ignored.
192 template <
typename KernelFunction>
193 size_t max_active_blocks(KernelFunction kernel,
const size_t CTA_SIZE,
const size_t dynamic_smem_bytes)
196 cudaGetDevice( &device );
198 cudaDeviceProp properties;
199 cudaGetDeviceProperties( &properties, device );
206 template <
typename KernelFunction>
210 return attributes.numRegs;
214 const cudaFuncAttributes& attributes,
215 size_t dynamic_smem_bytes_per_thread)
// Search for the block size that gives the highest occupancy.
// Candidates run from `largest_blocksize` downwards in steps of one warp
// (`granularity`); `occupancy` for each candidate is computed on a line that
// is not visible in this excerpt.
217 size_t max_occupancy = properties.maxThreadsPerMultiProcessor;
// The largest candidate is capped by both the device limit and the kernel's
// own maxThreadsPerBlock.
218 size_t largest_blocksize =
nvbio::min( properties.maxThreadsPerBlock, attributes.maxThreadsPerBlock );
219 size_t granularity = properties.warpSize;
220 size_t max_blocksize = 0;
221 size_t highest_occupancy = 0;
// NOTE(review): the loop stops only when blocksize is exactly 0. blocksize is
// an unsigned size_t, so if largest_blocksize is not a multiple of granularity
// the subtraction wraps around instead of terminating — confirm that the
// multiple-of-warp-size assumption always holds.
223 for(
size_t blocksize = largest_blocksize; blocksize != 0; blocksize -= granularity)
// Strict '>' keeps the first (i.e. largest) block size among equal occupancies.
227 if (occupancy > highest_occupancy)
229 max_blocksize = blocksize;
230 highest_occupancy = occupancy;
// Early exit once the best occupancy equals maxThreadsPerMultiProcessor
// (presumably still inside the loop — braces are not visible here).
234 if (highest_occupancy == max_occupancy)
235 return max_blocksize;
// Remains 0 if no candidate produced a non-zero occupancy.
238 return max_blocksize;
241 template <
typename KernelFunction>
245 cudaDeviceProp properties;
246 cudaGetDevice( &device );
247 cudaGetDeviceProperties( &properties, device );
// Returns true when the property record of the currently selected device has
// a non-zero `tccDriver` field, false otherwise.
// NOTE(review): the CUDA status codes of both calls are ignored.
257 cudaDeviceProp device_properties;
258 cudaGetDevice(&device);
259 cudaGetDeviceProperties( &device_properties, device );
260 return device_properties.tccDriver ?
true :
false;
// Read the last error recorded by the CUDA runtime (cudaGetLastError also
// resets it) and, if it is not cudaSuccess, log `message` followed by the
// runtime's description of the error.
265 cudaError_t error = cudaGetLastError();
266 if(error != cudaSuccess)
268 const char* error_string = cudaGetErrorString(error);
269 log_error(stderr,
"%s: %s\n", message, error_string );
281 #if defined(NVBIO_DEVICE_COMPILATION)