CUB
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Groups
Classes | Enumerations
Thread and thread block I/O

Classes

class  cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >
 The BlockLoad class provides collective data movement methods for loading a linear segment of items from memory into a blocked arrangement across a CUDA thread block.

block_load_logo.png
.
More...
 
class  cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >
 The BlockStore class provides collective data movement methods for writing a blocked arrangement of items partitioned across a CUDA thread block to a linear segment of memory.

block_store_logo.png
.
More...
 

Enumerations

enum  cub::CacheLoadModifier {
  cub::LOAD_DEFAULT, cub::LOAD_CA, cub::LOAD_CG, cub::LOAD_CS,
  cub::LOAD_CV, cub::LOAD_LDG, cub::LOAD_VOLATILE
}
 Enumeration of cache modifiers for memory load operations. More...
 
enum  cub::CacheStoreModifier {
  cub::STORE_DEFAULT, cub::STORE_WB, cub::STORE_CG, cub::STORE_CS,
  cub::STORE_WT, cub::STORE_VOLATILE
}
 Enumeration of cache modifiers for memory store operations. More...
 

Thread I/O (cache modified)

template<CacheLoadModifier MODIFIER, typename InputIteratorT >
__device__ __forceinline__
std::iterator_traits
< InputIteratorT >::value_type 
cub::ThreadLoad (InputIteratorT itr)
 Thread utility for reading memory using cub::CacheLoadModifier cache modifiers. Can be used to load any data type. More...
 
template<CacheStoreModifier MODIFIER, typename OutputIteratorT , typename T >
__device__ __forceinline__ void cub::ThreadStore (OutputIteratorT itr, T val)
 Thread utility for writing memory using cub::CacheStoreModifier cache modifiers. Can be used to store any data type. More...
 

Blocked arrangement I/O (direct)

template<typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectBlocked (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD])
 Load a linear segment of items into a blocked arrangement across the thread block. More...
 
template<typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectBlocked (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items)
 Load a linear segment of items into a blocked arrangement across the thread block, guarded by range. More...
 
template<typename InputT , typename DefaultT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectBlocked (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default)
 Load a linear segment of items into a blocked arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements. More...
 
template<typename T , int ITEMS_PER_THREAD>
__device__ __forceinline__ void cub::LoadDirectBlockedVectorized (int linear_tid, T *block_ptr, T(&items)[ITEMS_PER_THREAD])
 Load a linear segment of items into a blocked arrangement across the thread block. More...
 
template<typename T , int ITEMS_PER_THREAD, typename OutputIteratorT >
__device__ __forceinline__ void cub::StoreDirectBlocked (int linear_tid, OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD])
 Store a blocked arrangement of items across a thread block into a linear segment of items. More...
 
template<typename T , int ITEMS_PER_THREAD, typename OutputIteratorT >
__device__ __forceinline__ void cub::StoreDirectBlocked (int linear_tid, OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD], int valid_items)
 Store a blocked arrangement of items across a thread block into a linear segment of items, guarded by range. More...
 
template<typename T , int ITEMS_PER_THREAD>
__device__ __forceinline__ void cub::StoreDirectBlockedVectorized (int linear_tid, T *block_ptr, T(&items)[ITEMS_PER_THREAD])
 Store a blocked arrangement of items across a thread block into a linear segment of items. More...
 

Striped arrangement I/O (direct)

template<int BLOCK_THREADS, typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectStriped (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD])
 Load a linear segment of items into a striped arrangement across the thread block. More...
 
template<int BLOCK_THREADS, typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectStriped (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items)
 Load a linear segment of items into a striped arrangement across the thread block, guarded by range. More...
 
template<int BLOCK_THREADS, typename InputT , typename DefaultT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectStriped (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default)
 Load a linear segment of items into a striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements. More...
 
template<int BLOCK_THREADS, typename T , int ITEMS_PER_THREAD, typename OutputIteratorT >
__device__ __forceinline__ void cub::StoreDirectStriped (int linear_tid, OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD])
 Store a striped arrangement of data across the thread block into a linear segment of items. More...
 
template<int BLOCK_THREADS, typename T , int ITEMS_PER_THREAD, typename OutputIteratorT >
__device__ __forceinline__ void cub::StoreDirectStriped (int linear_tid, OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD], int valid_items)
 Store a striped arrangement of data across the thread block into a linear segment of items, guarded by range. More...
 

Warp-striped arrangement I/O (direct)

template<typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectWarpStriped (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD])
 Load a linear segment of items into a warp-striped arrangement across the thread block. More...
 
template<typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectWarpStriped (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items)
 Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range. More...
 
template<typename InputT , typename DefaultT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectWarpStriped (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default)
 Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements. More...
 
template<typename T , int ITEMS_PER_THREAD, typename OutputIteratorT >
__device__ __forceinline__ void cub::StoreDirectWarpStriped (int linear_tid, OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD])
 Store a warp-striped arrangement of data across the thread block into a linear segment of items. More...
 
template<typename T , int ITEMS_PER_THREAD, typename OutputIteratorT >
__device__ __forceinline__ void cub::StoreDirectWarpStriped (int linear_tid, OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD], int valid_items)
 Store a warp-striped arrangement of data across the thread block into a linear segment of items, guarded by range. More...
 

Enumeration Type Documentation

Enumeration of cache modifiers for memory load operations.

Enumerator
LOAD_DEFAULT 

Default (no modifier)

LOAD_CA 

Cache at all levels.

LOAD_CG 

Cache at global level.

LOAD_CS 

Cache streaming (likely to be accessed once)

LOAD_CV 

Cache as volatile (including cached system lines)

LOAD_LDG 

Cache as texture.

LOAD_VOLATILE 

Volatile (any memory space)

Enumeration of cache modifiers for memory store operations.

Enumerator
STORE_DEFAULT 

Default (no modifier)

STORE_WB 

Cache write-back all coherent levels.

STORE_CG 

Cache at global level.

STORE_CS 

Cache streaming (likely to be accessed once)

STORE_WT 

Cache write-through (to system memory)

STORE_VOLATILE 

Volatile shared (any memory space)

Function Documentation

template<CacheLoadModifier MODIFIER, typename InputIteratorT >
__device__ __forceinline__ std::iterator_traits<InputIteratorT>::value_type cub::ThreadLoad ( InputIteratorT  itr)

Thread utility for reading memory using cub::CacheLoadModifier cache modifiers. Can be used to load any data type.

Example
#include <cub/cub.cuh> // or equivalently <cub/thread/thread_load.cuh>
// 32-bit load using cache-at-all-levels modifier:
int *d_in;
int val = cub::ThreadLoad<cub::LOAD_CA>(d_in + threadIdx.x);
// 16-bit load using default modifier
short *d_in;
short val = cub::ThreadLoad<cub::LOAD_DEFAULT>(d_in + threadIdx.x);
// 256-bit load using cache-volatile modifier
double4 *d_in;
double4 val = cub::ThreadLoad<cub::LOAD_CV>(d_in + threadIdx.x);
// Load of a user-defined struct using cache-streaming modifier
struct TestFoo { bool a; short b; };
TestFoo *d_struct;
TestFoo val = cub::ThreadLoad<cub::LOAD_CS>(d_struct + threadIdx.x);
Template Parameters
MODIFIER[inferred] CacheLoadModifier enumeration
InputIteratorT[inferred] Input iterator type (may be a simple pointer type)
template<CacheStoreModifier MODIFIER, typename OutputIteratorT , typename T >
__device__ __forceinline__ void cub::ThreadStore ( OutputIteratorT  itr,
T  val 
)

Thread utility for writing memory using cub::CacheStoreModifier cache modifiers. Can be used to store any data type.

Example
#include <cub/cub.cuh> // or equivalently <cub/thread/thread_store.cuh>
// 32-bit store using cache-global modifier:
int *d_out;
int val;
cub::ThreadStore<cub::STORE_CG>(d_out + threadIdx.x, val);
// 16-bit store using default modifier
short *d_out;
short val;
cub::ThreadStore<cub::STORE_DEFAULT>(d_out + threadIdx.x, val);
// 256-bit store using write-through modifier
double4 *d_out;
double4 val;
cub::ThreadStore<cub::STORE_WT>(d_out + threadIdx.x, val);
// Store of a user-defined struct using cache-streaming cache modifier
struct TestFoo { bool a; short b; };
TestFoo *d_struct;
TestFoo val;
cub::ThreadStore<cub::STORE_CS>(d_struct + threadIdx.x, val);
Template Parameters
MODIFIER[inferred] CacheStoreModifier enumeration
OutputIteratorT[inferred] Output iterator type (may be a simple pointer type)
T[inferred] Data type of output value
template<typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectBlocked ( int  linear_tid,
InputIteratorT  block_itr,
InputT(&)  items[ITEMS_PER_THREAD] 
)

Load a linear segment of items into a blocked arrangement across the thread block.

Assumes a blocked arrangement of (block-threads*items-per-thread) items across the thread block, where thread_i owns the i-th range of items-per-thread contiguous items. For multi-dimensional thread blocks, a row-major thread ordering is assumed.

Template Parameters
InputT: [inferred] The data type to load.
ITEMS_PER_THREAD: [inferred] The number of consecutive items partitioned onto each thread.
InputIteratorT: [inferred] The random-access iterator type for input (may be a simple pointer type).
Parameters
[in] linear_tid: A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
[in] block_itr: The thread block's base input iterator for loading from
[out] items: Data to load
template<typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectBlocked ( int  linear_tid,
InputIteratorT  block_itr,
InputT(&)  items[ITEMS_PER_THREAD],
int  valid_items 
)

Load a linear segment of items into a blocked arrangement across the thread block, guarded by range.

Assumes a blocked arrangement of (block-threads*items-per-thread) items across the thread block, where thread_i owns the i-th range of items-per-thread contiguous items. For multi-dimensional thread blocks, a row-major thread ordering is assumed.

Template Parameters
InputT: [inferred] The data type to load.
ITEMS_PER_THREAD: [inferred] The number of consecutive items partitioned onto each thread.
InputIteratorT: [inferred] The random-access iterator type for input (may be a simple pointer type).
Parameters
[in] linear_tid: A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
[in] block_itr: The thread block's base input iterator for loading from
[out] items: Data to load
[in] valid_items: Number of valid items to load
template<typename InputT , typename DefaultT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectBlocked ( int  linear_tid,
InputIteratorT  block_itr,
InputT(&)  items[ITEMS_PER_THREAD],
int  valid_items,
DefaultT  oob_default 
)

Load a linear segment of items into a blocked arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.

Assumes a blocked arrangement of (block-threads*items-per-thread) items across the thread block, where thread_i owns the i-th range of items-per-thread contiguous items. For multi-dimensional thread blocks, a row-major thread ordering is assumed.

Template Parameters
InputT: [inferred] The data type to load.
DefaultT: [inferred] The data type of the default value to assign to out-of-bound items.
ITEMS_PER_THREAD: [inferred] The number of consecutive items partitioned onto each thread.
InputIteratorT: [inferred] The random-access iterator type for input (may be a simple pointer type).
Parameters
[in] linear_tid: A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
[in] block_itr: The thread block's base input iterator for loading from
[out] items: Data to load
[in] valid_items: Number of valid items to load
[in] oob_default: Default value to assign out-of-bound items
template<typename T , int ITEMS_PER_THREAD>
__device__ __forceinline__ void cub::LoadDirectBlockedVectorized ( int  linear_tid,
T *  block_ptr,
T(&)  items[ITEMS_PER_THREAD] 
)

Load a linear segment of items into a blocked arrangement across the thread block.

Assumes a blocked arrangement of (block-threads*items-per-thread) items across the thread block, where thread_i owns the i-th range of items-per-thread contiguous items. For multi-dimensional thread blocks, a row-major thread ordering is assumed.

The input offset (block_ptr + block_offset) must be quad-item aligned

The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT:

  • ITEMS_PER_THREAD is odd
  • The data type T is not a built-in primitive or CUDA vector type (e.g., short, int2, double, float2, etc.)
Template Parameters
T: [inferred] The data type to load.
ITEMS_PER_THREAD: [inferred] The number of consecutive items partitioned onto each thread.
Parameters
[in] linear_tid: A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
[in] block_ptr: Input pointer for loading from
[out] items: Data to load
template<int BLOCK_THREADS, typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectStriped ( int  linear_tid,
InputIteratorT  block_itr,
InputT(&)  items[ITEMS_PER_THREAD] 
)

Load a linear segment of items into a striped arrangement across the thread block.

Assumes a striped arrangement of (block-threads*items-per-thread) items across the thread block, where thread_i owns items (i), (i + block-threads), ..., (i + (block-threads*(items-per-thread-1))). For multi-dimensional thread blocks, a row-major thread ordering is assumed.

Template Parameters
BLOCK_THREADS: The thread block size in threads
InputT: [inferred] The data type to load.
ITEMS_PER_THREAD: [inferred] The number of consecutive items partitioned onto each thread.
InputIteratorT: [inferred] The random-access iterator type for input (may be a simple pointer type).
Parameters
[in] linear_tid: A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
[in] block_itr: The thread block's base input iterator for loading from
[out] items: Data to load
template<int BLOCK_THREADS, typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectStriped ( int  linear_tid,
InputIteratorT  block_itr,
InputT(&)  items[ITEMS_PER_THREAD],
int  valid_items 
)

Load a linear segment of items into a striped arrangement across the thread block, guarded by range.

Assumes a striped arrangement of (block-threads*items-per-thread) items across the thread block, where thread_i owns items (i), (i + block-threads), ..., (i + (block-threads*(items-per-thread-1))). For multi-dimensional thread blocks, a row-major thread ordering is assumed.

Template Parameters
BLOCK_THREADS: The thread block size in threads
InputT: [inferred] The data type to load.
ITEMS_PER_THREAD: [inferred] The number of consecutive items partitioned onto each thread.
InputIteratorT: [inferred] The random-access iterator type for input (may be a simple pointer type).
Parameters
[in] linear_tid: A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
[in] block_itr: The thread block's base input iterator for loading from
[out] items: Data to load
[in] valid_items: Number of valid items to load
template<int BLOCK_THREADS, typename InputT , typename DefaultT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectStriped ( int  linear_tid,
InputIteratorT  block_itr,
InputT(&)  items[ITEMS_PER_THREAD],
int  valid_items,
DefaultT  oob_default 
)

Load a linear segment of items into a striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.

Assumes a striped arrangement of (block-threads*items-per-thread) items across the thread block, where thread_i owns items (i), (i + block-threads), ..., (i + (block-threads*(items-per-thread-1))). For multi-dimensional thread blocks, a row-major thread ordering is assumed.

Template Parameters
BLOCK_THREADS: The thread block size in threads
InputT: [inferred] The data type to load.
DefaultT: [inferred] The data type of the default value to assign to out-of-bound items.
ITEMS_PER_THREAD: [inferred] The number of consecutive items partitioned onto each thread.
InputIteratorT: [inferred] The random-access iterator type for input (may be a simple pointer type).
Parameters
[in] linear_tid: A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
[in] block_itr: The thread block's base input iterator for loading from
[out] items: Data to load
[in] valid_items: Number of valid items to load
[in] oob_default: Default value to assign out-of-bound items
template<typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectWarpStriped ( int  linear_tid,
InputIteratorT  block_itr,
InputT(&)  items[ITEMS_PER_THREAD] 
)

Load a linear segment of items into a warp-striped arrangement across the thread block.

Assumes a warp-striped arrangement of elements across threads, where warp_i owns the i-th range of (warp-threads*items-per-thread) contiguous items, and each thread owns items (i), (i + warp-threads), ..., (i + (warp-threads*(items-per-thread-1))).

Usage Considerations
The number of threads in the thread block must be a multiple of the architecture's warp size.
Template Parameters
InputT: [inferred] The data type to load.
ITEMS_PER_THREAD: [inferred] The number of consecutive items partitioned onto each thread.
InputIteratorT: [inferred] The random-access iterator type for input (may be a simple pointer type).
Parameters
[in] linear_tid: A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
[in] block_itr: The thread block's base input iterator for loading from
[out] items: Data to load
template<typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectWarpStriped ( int  linear_tid,
InputIteratorT  block_itr,
InputT(&)  items[ITEMS_PER_THREAD],
int  valid_items 
)

Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range.

Assumes a warp-striped arrangement of elements across threads, where warp_i owns the i-th range of (warp-threads*items-per-thread) contiguous items, and each thread owns items (i), (i + warp-threads), ..., (i + (warp-threads*(items-per-thread-1))).

Usage Considerations
The number of threads in the thread block must be a multiple of the architecture's warp size.
Template Parameters
InputT: [inferred] The data type to load.
ITEMS_PER_THREAD: [inferred] The number of consecutive items partitioned onto each thread.
InputIteratorT: [inferred] The random-access iterator type for input (may be a simple pointer type).
Parameters
[in] linear_tid: A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
[in] block_itr: The thread block's base input iterator for loading from
[out] items: Data to load
[in] valid_items: Number of valid items to load
template<typename InputT , typename DefaultT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectWarpStriped ( int  linear_tid,
InputIteratorT  block_itr,
InputT(&)  items[ITEMS_PER_THREAD],
int  valid_items,
DefaultT  oob_default 
)

Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.

Assumes a warp-striped arrangement of elements across threads, where warp_i owns the i-th range of (warp-threads*items-per-thread) contiguous items, and each thread owns items (i), (i + warp-threads), ..., (i + (warp-threads*(items-per-thread-1))).

Usage Considerations
The number of threads in the thread block must be a multiple of the architecture's warp size.
Template Parameters
InputT: [inferred] The data type to load.
DefaultT: [inferred] The data type of the default value to assign to out-of-bound items.
ITEMS_PER_THREAD: [inferred] The number of consecutive items partitioned onto each thread.
InputIteratorT: [inferred] The random-access iterator type for input (may be a simple pointer type).
Parameters
[in] linear_tid: A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
[in] block_itr: The thread block's base input iterator for loading from
[out] items: Data to load
[in] valid_items: Number of valid items to load
[in] oob_default: Default value to assign out-of-bound items
template<typename T , int ITEMS_PER_THREAD, typename OutputIteratorT >
__device__ __forceinline__ void cub::StoreDirectBlocked ( int  linear_tid,
OutputIteratorT  block_itr,
T(&)  items[ITEMS_PER_THREAD] 
)

Store a blocked arrangement of items across a thread block into a linear segment of items.

Assumes a blocked arrangement of (block-threads*items-per-thread) items across the thread block, where thread_i owns the i-th range of items-per-thread contiguous items. For multi-dimensional thread blocks, a row-major thread ordering is assumed.

Template Parameters
T: [inferred] The data type to store.
ITEMS_PER_THREAD: [inferred] The number of consecutive items partitioned onto each thread.
OutputIteratorT: [inferred] The random-access iterator type for output (may be a simple pointer type).
Parameters
[in] linear_tid: A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
[in] block_itr: The thread block's base output iterator for storing to
[in] items: Data to store
template<typename T , int ITEMS_PER_THREAD, typename OutputIteratorT >
__device__ __forceinline__ void cub::StoreDirectBlocked ( int  linear_tid,
OutputIteratorT  block_itr,
T(&)  items[ITEMS_PER_THREAD],
int  valid_items 
)

Store a blocked arrangement of items across a thread block into a linear segment of items, guarded by range.

Assumes a blocked arrangement of (block-threads*items-per-thread) items across the thread block, where thread_i owns the i-th range of items-per-thread contiguous items. For multi-dimensional thread blocks, a row-major thread ordering is assumed.

Template Parameters
T: [inferred] The data type to store.
ITEMS_PER_THREAD: [inferred] The number of consecutive items partitioned onto each thread.
OutputIteratorT: [inferred] The random-access iterator type for output (may be a simple pointer type).
Parameters
[in] linear_tid: A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
[in] block_itr: The thread block's base output iterator for storing to
[in] items: Data to store
[in] valid_items: Number of valid items to write
template<typename T , int ITEMS_PER_THREAD>
__device__ __forceinline__ void cub::StoreDirectBlockedVectorized ( int  linear_tid,
T *  block_ptr,
T(&)  items[ITEMS_PER_THREAD] 
)

Store a blocked arrangement of items across a thread block into a linear segment of items.

Assumes a blocked arrangement of (block-threads*items-per-thread) items across the thread block, where thread_i owns the i-th range of items-per-thread contiguous items. For multi-dimensional thread blocks, a row-major thread ordering is assumed.

The output offset (block_ptr + block_offset) must be quad-item aligned, which is the default starting offset returned by cudaMalloc()

The following conditions will prevent vectorization and storing will fall back to cub::BLOCK_STORE_DIRECT:
  • ITEMS_PER_THREAD is odd
  • The data type T is not a built-in primitive or CUDA vector type (e.g., short, int2, double, float2, etc.)
Template Parameters
T: [inferred] The data type to store.
ITEMS_PER_THREAD: [inferred] The number of consecutive items partitioned onto each thread.
Parameters
[in] linear_tid: A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
[in] block_ptr: Output pointer for storing to
[in] items: Data to store
template<int BLOCK_THREADS, typename T , int ITEMS_PER_THREAD, typename OutputIteratorT >
__device__ __forceinline__ void cub::StoreDirectStriped ( int  linear_tid,
OutputIteratorT  block_itr,
T(&)  items[ITEMS_PER_THREAD] 
)

Store a striped arrangement of data across the thread block into a linear segment of items.

Assumes a striped arrangement of (block-threads*items-per-thread) items across the thread block, where thread_i owns items (i), (i + block-threads), ..., (i + (block-threads*(items-per-thread-1))). For multi-dimensional thread blocks, a row-major thread ordering is assumed.

Template Parameters
BLOCK_THREADS: The thread block size in threads
T: [inferred] The data type to store.
ITEMS_PER_THREAD: [inferred] The number of consecutive items partitioned onto each thread.
OutputIteratorT: [inferred] The random-access iterator type for output (may be a simple pointer type).
Parameters
[in] linear_tid: A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
[in] block_itr: The thread block's base output iterator for storing to
[in] items: Data to store
template<int BLOCK_THREADS, typename T , int ITEMS_PER_THREAD, typename OutputIteratorT >
__device__ __forceinline__ void cub::StoreDirectStriped ( int  linear_tid,
OutputIteratorT  block_itr,
T(&)  items[ITEMS_PER_THREAD],
int  valid_items 
)

Store a striped arrangement of data across the thread block into a linear segment of items, guarded by range.

Assumes a striped arrangement of (block-threads*items-per-thread) items across the thread block, where thread_i owns items (i), (i + block-threads), ..., (i + (block-threads*(items-per-thread-1))). For multi-dimensional thread blocks, a row-major thread ordering is assumed.

Template Parameters
BLOCK_THREADS: The thread block size in threads
T: [inferred] The data type to store.
ITEMS_PER_THREAD: [inferred] The number of consecutive items partitioned onto each thread.
OutputIteratorT: [inferred] The random-access iterator type for output (may be a simple pointer type).
Parameters
[in] linear_tid: A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
[in] block_itr: The thread block's base output iterator for storing to
[in] items: Data to store
[in] valid_items: Number of valid items to write
template<typename T , int ITEMS_PER_THREAD, typename OutputIteratorT >
__device__ __forceinline__ void cub::StoreDirectWarpStriped ( int  linear_tid,
OutputIteratorT  block_itr,
T(&)  items[ITEMS_PER_THREAD] 
)

Store a warp-striped arrangement of data across the thread block into a linear segment of items.

Assumes a warp-striped arrangement of elements across threads, where warp_i owns the i-th range of (warp-threads*items-per-thread) contiguous items, and each thread owns items (i), (i + warp-threads), ..., (i + (warp-threads*(items-per-thread-1))).

Usage Considerations
The number of threads in the thread block must be a multiple of the architecture's warp size.
Template Parameters
T: [inferred] The data type to store.
ITEMS_PER_THREAD: [inferred] The number of consecutive items partitioned onto each thread.
OutputIteratorT: [inferred] The random-access iterator type for output (may be a simple pointer type).
Parameters
[in] linear_tid: A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
[in] block_itr: The thread block's base output iterator for storing to
[in] items: Data to store
template<typename T , int ITEMS_PER_THREAD, typename OutputIteratorT >
__device__ __forceinline__ void cub::StoreDirectWarpStriped ( int  linear_tid,
OutputIteratorT  block_itr,
T(&)  items[ITEMS_PER_THREAD],
int  valid_items 
)

Store a warp-striped arrangement of data across the thread block into a linear segment of items, guarded by range.

Assumes a warp-striped arrangement of elements across threads, where warp_i owns the i-th range of (warp-threads*items-per-thread) contiguous items, and each thread owns items (i), (i + warp-threads), ..., (i + (warp-threads*(items-per-thread-1))).

Usage Considerations
The number of threads in the thread block must be a multiple of the architecture's warp size.
Template Parameters
T: [inferred] The data type to store.
ITEMS_PER_THREAD: [inferred] The number of consecutive items partitioned onto each thread.
OutputIteratorT: [inferred] The random-access iterator type for output (may be a simple pointer type).
Parameters
[in] linear_tid: A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
[in] block_itr: The thread block's base output iterator for storing to
[in] items: Data to store
[in] valid_items: Number of valid items to write