33 if (cuda_devices::s_cuda_devices == NULL)
38 return cuda_devices::s_cuda_devices;
42 inline cudaDeviceProp get_device_properties()
45 cudaGetDevice( &device );
53 inline void device_arch(uint32& major, uint32& minor)
55 cudaDeviceProp
properties = get_device_properties();
57 major = properties.major;
58 minor = properties.minor;
62 inline uint32 max_grid_size()
65 device_arch( major, minor );
66 return major <= 2 ? 32*1024 : uint32(-1);
70 inline size_t multiprocessor_count()
72 cudaDeviceProp
properties = get_device_properties();
74 return properties.multiProcessorCount;
78 inline size_t smem_allocation_unit(
const cudaDeviceProp &
properties)
80 switch(properties.major)
89 inline size_t reg_allocation_unit(
const cudaDeviceProp& properties,
const size_t regsPerThread)
91 switch(properties.major)
93 case 1:
return (properties.minor <= 1) ? 256 : 512;
94 case 2:
switch(regsPerThread)
115 inline size_t warp_allocation_multiple(
const cudaDeviceProp& properties)
117 return (properties.major <= 1) ? 2 : 1;
121 inline size_t num_sides_per_multiprocessor(
const cudaDeviceProp& properties)
123 switch (properties.major)
132 template <
typename KernelFunction>
133 inline cudaFuncAttributes function_attributes(KernelFunction kernel)
135 cudaFuncAttributes attributes;
138 typedef void (*fun_ptr_type)();
140 fun_ptr_type fun_ptr = (fun_ptr_type)(kernel);
142 cudaFuncGetAttributes(&attributes, fun_ptr);
148 inline size_t max_blocks_per_multiprocessor(
const cudaDeviceProp& properties)
150 return properties.major <= 2 ? 8 : 16;
154 inline size_t num_regs_per_block(
const cudaDeviceProp& properties,
const cudaFuncAttributes& attributes,
const size_t CTA_SIZE)
156 const size_t maxBlocksPerSM = max_blocks_per_multiprocessor(properties);
157 const size_t regAllocationUnit = reg_allocation_unit(properties, attributes.numRegs);
158 const size_t warpAllocationMultiple = warp_allocation_multiple(properties);
159 const size_t numWarps =
round_i(
divide_ri(CTA_SIZE, properties.warpSize), warpAllocationMultiple);
162 if(properties.major <= 1)
166 const size_t regsPerCTA =
round_i(attributes.numRegs * properties.warpSize * numWarps, regAllocationUnit);
167 return regsPerCTA > 0 ? properties.regsPerBlock / regsPerCTA : maxBlocksPerSM;
173 const size_t regsPerWarp =
round_i(attributes.numRegs * properties.warpSize, regAllocationUnit);
174 const size_t numSides = num_sides_per_multiprocessor(properties);
175 const size_t numRegsPerSide = properties.regsPerBlock / numSides;
176 return regsPerWarp > 0 ? ((numRegsPerSide / regsPerWarp) * numSides) / numWarps : maxBlocksPerSM;
180 inline size_t max_active_blocks_per_multiprocessor(
const cudaDeviceProp& properties,
181 const cudaFuncAttributes& attributes,
183 size_t dynamic_smem_bytes)
191 const size_t maxThreadsPerSM = properties.maxThreadsPerMultiProcessor;
192 const size_t maxBlocksPerSM = max_blocks_per_multiprocessor(properties);
195 const size_t ctaLimitThreads = (CTA_SIZE <= size_t(properties.maxThreadsPerBlock)) ? maxThreadsPerSM / CTA_SIZE : 0;
196 const size_t ctaLimitBlocks = maxBlocksPerSM;
201 const size_t smemAllocationUnit = smem_allocation_unit(properties);
202 const size_t smemBytes = attributes.sharedSizeBytes + dynamic_smem_bytes;
203 const size_t smemPerCTA =
round_i(smemBytes, smemAllocationUnit);
206 const size_t ctaLimitSMem = smemPerCTA > 0 ? properties.sharedMemPerBlock / smemPerCTA : maxBlocksPerSM;
211 const size_t ctaLimitRegs = num_regs_per_block( properties, attributes, CTA_SIZE );
216 return cugar::min(ctaLimitRegs, cugar::min(ctaLimitSMem, cugar::min(ctaLimitThreads, ctaLimitBlocks)));
219 template <
typename KernelFunction>
220 size_t max_active_blocks_per_multiprocessor(KernelFunction kernel,
const size_t CTA_SIZE,
const size_t dynamic_smem_bytes)
223 cudaDeviceProp properties = get_device_properties();
225 cudaFuncAttributes attributes = function_attributes( kernel );
227 return max_active_blocks_per_multiprocessor(properties, attributes, CTA_SIZE, dynamic_smem_bytes);
230 cudaOccupancyMaxActiveBlocksPerMultiprocessor( &maxActiveBlocks, kernel,
int(CTA_SIZE),
int(dynamic_smem_bytes) );
231 return size_t(maxActiveBlocks);
235 template <
typename KernelFunction>
236 size_t max_active_blocks(KernelFunction kernel,
const size_t CTA_SIZE,
const size_t dynamic_smem_bytes)
238 cudaDeviceProp properties = get_device_properties();
240 cudaFuncAttributes attributes = function_attributes( kernel );
242 return properties.multiProcessorCount * max_active_blocks_per_multiprocessor(properties, attributes, CTA_SIZE, dynamic_smem_bytes);
245 template <
typename KernelFunction>
246 size_t num_registers(KernelFunction kernel)
248 cudaFuncAttributes attributes = function_attributes( kernel );
249 return attributes.numRegs;
252 inline size_t max_blocksize_with_highest_occupancy(
const cudaDeviceProp& properties,
253 const cudaFuncAttributes& attributes,
254 size_t dynamic_smem_bytes_per_thread)
256 size_t max_occupancy = properties.maxThreadsPerMultiProcessor;
257 size_t largest_blocksize = cugar::min( properties.maxThreadsPerBlock, attributes.maxThreadsPerBlock );
258 size_t granularity = properties.warpSize;
259 size_t max_blocksize = 0;
260 size_t highest_occupancy = 0;
262 for(
size_t blocksize = largest_blocksize; blocksize != 0; blocksize -= granularity)
264 size_t occupancy = blocksize * max_active_blocks_per_multiprocessor(properties, attributes, blocksize, dynamic_smem_bytes_per_thread * blocksize);
266 if (occupancy > highest_occupancy)
268 max_blocksize = blocksize;
269 highest_occupancy = occupancy;
273 if (highest_occupancy == max_occupancy)
274 return max_blocksize;
276 return max_blocksize;
289 int operator() (
const int block_size)
const {
return block_size * bytes_per_thread; }
291 int bytes_per_thread;
294 template <
typename KernelFunction>
295 size_t max_blocksize_with_highest_occupancy(KernelFunction kernel,
size_t dynamic_smem_bytes_per_thread)
298 cudaDeviceProp properties = get_device_properties();
300 cudaFuncAttributes attributes = function_attributes( kernel );
302 return max_blocksize_with_highest_occupancy(properties, attributes, dynamic_smem_bytes_per_thread);
306 cudaOccupancyMaxPotentialBlockSizeVariableSMem( &minGridSize, &blockSize, kernel,
PerThreadSmemUnaryFunction(
int(dynamic_smem_bytes_per_thread)));
307 return size_t(blockSize);
311 inline bool is_tcc_enabled()
313 cudaDeviceProp properties = get_device_properties();
315 return properties.tccDriver ? true :
false;
318 inline void check_error(
const cudaError_t error,
const char *message)
320 if (error != cudaSuccess)
322 const char* error_string = cudaGetErrorString(error);
323 char error_message[2048];
324 sprintf(error_message,
"%s in %s", error_string, message);
328 inline void check_error(
const char *message)
330 check_error( cudaGetLastError(), message );
333 inline void sync_and_check_error(
const char *message)
335 cudaDeviceSynchronize();
336 check_error(cudaGetLastError(), message);
343 CUGAR_FORCEINLINE CUGAR_HOST_DEVICE
346 #if defined(CUGAR_DEVICE_COMPILATION) CUGAR_HOST_DEVICE L round_i(const L x, const R y)
Definition: numbers.h:198
CUGAR_FORCEINLINE CUGAR_HOST_DEVICE void syncthreads()
Definition: arch_inl.h:344
Definition: threads.h:181
CUGAR_HOST_DEVICE L divide_ri(const L x, const R y)
Definition: numbers.h:180
static cuda_devices * get()
Definition: arch_inl.h:31
Define a vector_view POD type and plain_view() for std::vector.
Definition: diff.h:38
cudaDeviceProp * properties
device properties
Definition: arch.h:62
Definition: arch_inl.h:281
Definition: exceptions.h:44