CUB  
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
Functions
PTX intrinsics

Functions

__device__ __forceinline__
unsigned int 
cub::SHR_ADD (unsigned int x, unsigned int shift, unsigned int addend)
 Shift-right then add. Returns (x >> shift) + addend. More...
 
__device__ __forceinline__
unsigned int 
cub::SHL_ADD (unsigned int x, unsigned int shift, unsigned int addend)
 Shift-left then add. Returns (x << shift) + addend. More...
 
template<typename UnsignedBits >
__device__ __forceinline__
unsigned int 
cub::BFE (UnsignedBits source, unsigned int bit_start, unsigned int num_bits)
 Bitfield-extract. Extracts num_bits from source starting at bit-offset bit_start. The input source may be an 8b, 16b, 32b, or 64b unsigned integer type. More...
 
__device__ __forceinline__ void cub::BFI (unsigned int &ret, unsigned int x, unsigned int y, unsigned int bit_start, unsigned int num_bits)
 Bitfield insert. Inserts the num_bits least significant bits of y into x at bit-offset bit_start. More...
 
__device__ __forceinline__
unsigned int 
cub::IADD3 (unsigned int x, unsigned int y, unsigned int z)
 Three-operand add. Returns x + y + z. More...
 
__device__ __forceinline__ int cub::PRMT (unsigned int a, unsigned int b, unsigned int index)
 Byte-permute. Pick four arbitrary bytes from two 32-bit registers, and reassemble them into a 32-bit destination register. For SM2.0 or later. More...
 
__device__ __forceinline__ void cub::ThreadExit ()
 Terminates the calling thread. More...
 
__device__ __forceinline__ void cub::ThreadTrap ()
 Abort execution and generate an interrupt to the host CPU. More...
 
__device__ __forceinline__ int cub::RowMajorTid (int block_dim_x, int block_dim_y, int block_dim_z)
 Returns the row-major linear thread identifier for a multidimensional thread block. More...
 
__device__ __forceinline__
unsigned int 
cub::LaneId ()
 Returns the warp lane ID of the calling thread. More...
 
__device__ __forceinline__
unsigned int 
cub::WarpId ()
 Returns the warp ID of the calling thread. Warp ID is guaranteed to be unique among warps, but may not correspond to a zero-based ranking within the thread block. More...
 
template<int LOGICAL_WARP_THREADS, int PTX_ARCH = CUB_PTX_ARCH>
__host__ __device__
__forceinline__ unsigned int 
cub::WarpMask (unsigned int warp_id)
 Returns the warp mask for a warp of LOGICAL_WARP_THREADS threads. More...
 
__device__ __forceinline__
unsigned int 
cub::LaneMaskLt ()
 Returns the warp lane mask of all lanes less than the calling thread. More...
 
__device__ __forceinline__
unsigned int 
cub::LaneMaskLe ()
 Returns the warp lane mask of all lanes less than or equal to the calling thread. More...
 
__device__ __forceinline__
unsigned int 
cub::LaneMaskGt ()
 Returns the warp lane mask of all lanes greater than the calling thread. More...
 
__device__ __forceinline__
unsigned int 
cub::LaneMaskGe ()
 Returns the warp lane mask of all lanes greater than or equal to the calling thread. More...
 

Function Documentation

__device__ __forceinline__ unsigned int cub::SHR_ADD ( unsigned int  x,
unsigned int  shift,
unsigned int  addend 
)

Shift-right then add. Returns (x >> shift) + addend.

__device__ __forceinline__ unsigned int cub::SHL_ADD ( unsigned int  x,
unsigned int  shift,
unsigned int  addend 
)

Shift-left then add. Returns (x << shift) + addend.

template<typename UnsignedBits >
__device__ __forceinline__ unsigned int cub::BFE ( UnsignedBits  source,
unsigned int  bit_start,
unsigned int  num_bits 
)

Bitfield-extract. Extracts num_bits from source starting at bit-offset bit_start. The input source may be an 8b, 16b, 32b, or 64b unsigned integer type.

__device__ __forceinline__ void cub::BFI ( unsigned int &  ret,
unsigned int  x,
unsigned int  y,
unsigned int  bit_start,
unsigned int  num_bits 
)

Bitfield insert. Inserts the num_bits least significant bits of y into x at bit-offset bit_start.

__device__ __forceinline__ unsigned int cub::IADD3 ( unsigned int  x,
unsigned int  y,
unsigned int  z 
)

Three-operand add. Returns x + y + z.

__device__ __forceinline__ int cub::PRMT ( unsigned int  a,
unsigned int  b,
unsigned int  index 
)

Byte-permute. Pick four arbitrary bytes from two 32-bit registers, and reassemble them into a 32-bit destination register. For SM2.0 or later.

The bytes in the two source registers a and b are numbered from 0 to 7: {b, a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}. Each of the four bytes of the return value is chosen by a 4-bit selector held in one of the four lower "nibbles" of index, where {index} = {n7, n6, n5, n4, n3, n2, n1, n0}: nibble n0 selects the source byte placed in the least significant byte of the result, n1 the next byte, and so on up to n3 for the most significant byte.
Snippet
The code snippet below illustrates byte-permute.
#include <cub/cub.cuh>
__global__ void ExampleKernel(...)
{
int a = 0x03020100;
int b = 0x07060504;
int index = 0x00007531;
int selected = PRMT(a, b, index); // 0x07050301
}
__device__ __forceinline__ void cub::ThreadExit ( )

Terminates the calling thread.

__device__ __forceinline__ void cub::ThreadTrap ( )

Abort execution and generate an interrupt to the host CPU.

__device__ __forceinline__ int cub::RowMajorTid ( int  block_dim_x,
int  block_dim_y,
int  block_dim_z 
)

Returns the row-major linear thread identifier for a multidimensional thread block.

__device__ __forceinline__ unsigned int cub::LaneId ( )

Returns the warp lane ID of the calling thread.

__device__ __forceinline__ unsigned int cub::WarpId ( )

Returns the warp ID of the calling thread. Warp ID is guaranteed to be unique among warps, but may not correspond to a zero-based ranking within the thread block.

template<int LOGICAL_WARP_THREADS, int PTX_ARCH = CUB_PTX_ARCH>
__host__ __device__ __forceinline__ unsigned int cub::WarpMask ( unsigned int  warp_id)

Returns the warp mask for a warp of LOGICAL_WARP_THREADS threads.

If the number of threads assigned to the virtual warp is not a power of two, it's assumed that only one virtual warp exists.
Template Parameters
LOGICAL_WARP_THREADS: [optional] The number of threads per "logical" warp (may be less than the number of hardware warp threads).
Parameters
warp_id: ID of the virtual warp within the architectural warp.
__device__ __forceinline__ unsigned int cub::LaneMaskLt ( )

Returns the warp lane mask of all lanes less than the calling thread.

__device__ __forceinline__ unsigned int cub::LaneMaskLe ( )

Returns the warp lane mask of all lanes less than or equal to the calling thread.

__device__ __forceinline__ unsigned int cub::LaneMaskGt ( )

Returns the warp lane mask of all lanes greater than the calling thread.

__device__ __forceinline__ unsigned int cub::LaneMaskGe ( )

Returns the warp lane mask of all lanes greater than or equal to the calling thread.