CUB  
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
Classes | Macros | Functions
Device, kernel, and storage management

Classes

struct  cub::CachingDeviceAllocator
 A simple caching allocator for device memory allocations. More...
 
struct  cub::SwitchDevice
 RAII helper which saves the current device and switches to the specified device on construction, then switches back to the saved device on destruction. More...
 
struct  cub::KernelConfig
 
struct  cub::ChainedPolicy< PTX_VERSION, PolicyT, PrevPolicyT >
 Helper for dispatching into a policy chain. More...
 
struct  cub::ChainedPolicy< PTX_VERSION, PolicyT, PolicyT >
 Helper for dispatching into a policy chain (end-of-chain specialization) More...
 

Macros

#define CubDebug(e)   CUB_NS_QUALIFIER::Debug((cudaError_t) (e), __FILE__, __LINE__)
 Debug macro. More...
 
#define CubDebugExit(e)   if (CUB_NS_QUALIFIER::Debug((cudaError_t) (e), __FILE__, __LINE__)) { exit(1); }
 Debug macro with exit. More...
 
#define _CubLog(format,...)   printf(format,__VA_ARGS__);
 Log macro for printf statements. More...
 

Functions

__host__ __device__
__forceinline__ cudaError_t 
cub::Debug (cudaError_t error, const char *filename, int line)
 CUB error reporting function (prints error messages to stderr) More...
 
CUB_RUNTIME_FUNCTION int cub::CurrentDevice ()
 Returns the current device or -1 if an error occurred. More...
 
CUB_RUNTIME_FUNCTION int cub::DeviceCountUncached ()
 Returns the number of CUDA devices available or -1 if an error occurred. More...
 
CUB_RUNTIME_FUNCTION int cub::DeviceCount ()
 Returns the number of CUDA devices available. More...
 
CUB_RUNTIME_FUNCTION cudaError_t cub::PtxVersionUncached (int &ptx_version)
 Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10). More...
 
__host__ cudaError_t cub::PtxVersionUncached (int &ptx_version, int device)
 Retrieves the PTX version that will be used on device (major * 100 + minor * 10). More...
 
__host__ cudaError_t cub::PtxVersion (int &ptx_version, int device)
 Retrieves the PTX version that will be used on device (major * 100 + minor * 10). More...
 
CUB_RUNTIME_FUNCTION cudaError_t cub::PtxVersion (int &ptx_version)
 Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10). More...
 
CUB_RUNTIME_FUNCTION cudaError_t cub::SmVersionUncached (int &sm_version, int device=CurrentDevice())
 Retrieves the SM version of device (major * 100 + minor * 10) More...
 
CUB_RUNTIME_FUNCTION cudaError_t cub::SmVersion (int &sm_version, int device=CurrentDevice())
 Retrieves the SM version of device (major * 100 + minor * 10) More...
 
CUB_RUNTIME_FUNCTION cudaError_t cub::SyncStream (cudaStream_t stream)
 Synchronize the specified stream. More...
 
template<typename KernelPtr >
CUB_RUNTIME_FUNCTION cudaError_t cub::MaxSmOccupancy (int &max_sm_occupancy, KernelPtr kernel_ptr, int block_threads, int dynamic_smem_bytes=0)
 Computes the maximum SM occupancy, in thread blocks, for executing the kernel referenced by kernel_ptr on the current device with block_threads threads per thread block. More...
 

Macro Definition Documentation

#define CubDebug (   e)    CUB_NS_QUALIFIER::Debug((cudaError_t) (e), __FILE__, __LINE__)

Debug macro.

#define CubDebugExit (   e)    if (CUB_NS_QUALIFIER::Debug((cudaError_t) (e), __FILE__, __LINE__)) { exit(1); }

Debug macro with exit.

#define _CubLog (   format,
  ... 
)    printf(format,__VA_ARGS__);

Log macro for printf statements.

Function Documentation

__host__ __device__ __forceinline__ cudaError_t cub::Debug ( cudaError_t  error,
const char *  filename,
int  line 
)

CUB error reporting function (prints error messages to stderr)

If CUB_STDERR is defined and error is not cudaSuccess, the corresponding error message is printed to stderr (or stdout in device code) along with the supplied source context.

Returns
The CUDA error.
CUB_RUNTIME_FUNCTION int cub::CurrentDevice ( )
inline

Returns the current device or -1 if an error occurred.

CUB_RUNTIME_FUNCTION int cub::DeviceCountUncached ( )
inline

Returns the number of CUDA devices available or -1 if an error occurred.

CUB_RUNTIME_FUNCTION int cub::DeviceCount ( )
inline

Returns the number of CUDA devices available.

Note
This function may cache the result internally.
This function is thread safe.
CUB_RUNTIME_FUNCTION cudaError_t cub::PtxVersionUncached ( int &  ptx_version)
inline

Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10).

__host__ cudaError_t cub::PtxVersionUncached ( int &  ptx_version,
int  device 
)
inline

Retrieves the PTX version that will be used on device (major * 100 + minor * 10).

__host__ cudaError_t cub::PtxVersion ( int &  ptx_version,
int  device 
)
inline

Retrieves the PTX version that will be used on device (major * 100 + minor * 10).

Note
This function may cache the result internally.
This function is thread safe.
CUB_RUNTIME_FUNCTION cudaError_t cub::PtxVersion ( int &  ptx_version)
inline

Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10).

Note
This function may cache the result internally.
This function is thread safe.
CUB_RUNTIME_FUNCTION cudaError_t cub::SmVersionUncached ( int &  sm_version,
int  device = CurrentDevice() 
)
inline

Retrieves the SM version of device (major * 100 + minor * 10)

CUB_RUNTIME_FUNCTION cudaError_t cub::SmVersion ( int &  sm_version,
int  device = CurrentDevice() 
)
inline

Retrieves the SM version of device (major * 100 + minor * 10)

Note
This function may cache the result internally.
This function is thread safe.
CUB_RUNTIME_FUNCTION cudaError_t cub::SyncStream ( cudaStream_t  stream)
inline

Synchronize the specified stream.

template<typename KernelPtr >
CUB_RUNTIME_FUNCTION cudaError_t cub::MaxSmOccupancy ( int &  max_sm_occupancy,
KernelPtr  kernel_ptr,
int  block_threads,
int  dynamic_smem_bytes = 0 
)
inline

Computes the maximum SM occupancy, in thread blocks, for executing the kernel referenced by kernel_ptr on the current device with block_threads threads per thread block.

Snippet
The code snippet below illustrates the use of the MaxSmOccupancy function.
#include <cub/cub.cuh> // or equivalently <cub/util_device.cuh>
template <typename T>
__global__ void ExampleKernel()
{
// Allocate a statically sized shared memory buffer
__shared__ volatile T buffer[4096];
...
}
...
// Determine SM occupancy for ExampleKernel specialized for unsigned char
int max_sm_occupancy;
MaxSmOccupancy(max_sm_occupancy, ExampleKernel<unsigned char>, 64);
// max_sm_occupancy <-- 4 on SM10
// max_sm_occupancy <-- 8 on SM20
// max_sm_occupancy <-- 12 on SM35
Parameters
[out] max_sm_occupancy: Maximum number of thread blocks that can reside on a single SM.
[in] kernel_ptr: Kernel pointer for which to compute SM occupancy.
[in] block_threads: Number of threads per thread block.
[in] dynamic_smem_bytes: Dynamically allocated shared memory in bytes. Default is 0.
Examples:
example_block_radix_sort.cu, example_block_reduce.cu, and example_block_scan.cu.