34 #include <cuda_runtime.h>
35 #include <thrust/version.h>
38 #include <thrust/device_vector.h>
39 #include <thrust/host_vector.h>
// Forward declaration of reg_allocation_unit; the definition is not part of
// this section of the file.
//
// Parameters:
//   properties    - device properties structure (cudaDeviceProp) to consult.
//   regsPerThread - a per-thread register count.
// Returns: a size_t.
//
// NOTE(review): judging by the name, the result is presumably the granularity
// in which the device hands out registers -- confirm against the definition.
63 inline size_t reg_allocation_unit(
const cudaDeviceProp& properties,
const size_t regsPerThread);
// Forward declaration of num_regs_per_block; the definition is not part of
// this section of the file.
//
// Parameters:
//   properties - device properties structure (cudaDeviceProp) to consult.
//   attributes - function attributes (cudaFuncAttributes) of the kernel of
//                interest.
//   CTA_SIZE   - a block (CTA) size; presumably in threads -- TODO confirm.
// Returns: a size_t.
//
// NOTE(review): judging by the name, the result is presumably the number of
// registers one block of CTA_SIZE threads occupies -- confirm against the
// definition.
75 inline size_t num_regs_per_block(
const cudaDeviceProp& properties,
const cudaFuncAttributes& attributes,
const size_t CTA_SIZE);
77 template <
typename KernelFunction>
80 template <
typename KernelFunction>
// Forward declaration of the max_active_blocks function template; the
// definition is not part of this section of the file.
//
// Template parameters:
//   KernelFunction - type of the kernel argument, passed by value.
// Parameters:
//   kernel             - the kernel to query.
//   CTA_SIZE           - a block (CTA) size; presumably in threads -- TODO
//                        confirm.
//   dynamic_smem_bytes - a byte count; presumably the dynamically allocated
//                        shared memory per block -- TODO confirm.
// Returns: a size_t.
//
// NOTE(review): judging by the name, the result is presumably an upper bound
// on concurrently resident blocks for this kernel and launch configuration;
// whether it is per multiprocessor or per device cannot be told from here --
// confirm against the definition.
83 template <
typename KernelFunction>
84 size_t max_active_blocks(KernelFunction kernel,
const size_t CTA_SIZE,
const size_t dynamic_smem_bytes);
86 template <
typename KernelFunction>
89 template <
typename KernelFunction>