|
class | cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH > |
| The BlockLoad class provides collective data movement methods for loading a linear segment of items from memory into a blocked arrangement across a CUDA thread block. More...
|
|
class | cub::BlockStore< T, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH > |
| The BlockStore class provides collective data movement methods for writing a blocked arrangement of items partitioned across a CUDA thread block to a linear segment of memory. More...
|
|
class | WarpLoad< InputT, ITEMS_PER_THREAD, ALGORITHM, LOGICAL_WARP_THREADS, PTX_ARCH > |
| The WarpLoad class provides collective data movement methods for loading a linear segment of items from memory into a blocked arrangement across a CUDA warp. More...
|
|
class | WarpStore< T, ITEMS_PER_THREAD, ALGORITHM, LOGICAL_WARP_THREADS, PTX_ARCH > |
| The WarpStore class provides collective data movement methods for writing a blocked arrangement of items partitioned across a CUDA warp to a linear segment of memory. More...
|
|
|
template<typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT > |
__device__ __forceinline__ void | cub::LoadDirectBlocked (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD]) |
| Load a linear segment of items into a blocked arrangement across the thread block. More...
|
|
template<typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT > |
__device__ __forceinline__ void | cub::LoadDirectBlocked (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items) |
| Load a linear segment of items into a blocked arrangement across the thread block, guarded by range. More...
|
|
template<typename InputT , typename DefaultT , int ITEMS_PER_THREAD, typename InputIteratorT > |
__device__ __forceinline__ void | cub::LoadDirectBlocked (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) |
| Load a linear segment of items into a blocked arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements. More...
|
|
template<typename T , int ITEMS_PER_THREAD> |
__device__ __forceinline__ void | cub::LoadDirectBlockedVectorized (int linear_tid, T *block_ptr, T(&items)[ITEMS_PER_THREAD]) |
| Load a linear segment of items into a blocked arrangement across the thread block. More...
|
|
template<typename T , int ITEMS_PER_THREAD, typename OutputIteratorT > |
__device__ __forceinline__ void | cub::StoreDirectBlocked (int linear_tid, OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD]) |
| Store a blocked arrangement of items across a thread block into a linear segment of items. More...
|
|
template<typename T , int ITEMS_PER_THREAD, typename OutputIteratorT > |
__device__ __forceinline__ void | cub::StoreDirectBlocked (int linear_tid, OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD], int valid_items) |
| Store a blocked arrangement of items across a thread block into a linear segment of items, guarded by range. More...
|
|
template<typename T , int ITEMS_PER_THREAD> |
__device__ __forceinline__ void | cub::StoreDirectBlockedVectorized (int linear_tid, T *block_ptr, T(&items)[ITEMS_PER_THREAD]) |
| Store a blocked arrangement of items across a thread block into a linear segment of items. More...
|
|
|
template<int BLOCK_THREADS, typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT > |
__device__ __forceinline__ void | cub::LoadDirectStriped (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD]) |
| Load a linear segment of items into a striped arrangement across the thread block. More...
|
|
template<int BLOCK_THREADS, typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT > |
__device__ __forceinline__ void | cub::LoadDirectStriped (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items) |
| Load a linear segment of items into a striped arrangement across the thread block, guarded by range. More...
|
|
template<int BLOCK_THREADS, typename InputT , typename DefaultT , int ITEMS_PER_THREAD, typename InputIteratorT > |
__device__ __forceinline__ void | cub::LoadDirectStriped (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) |
| Load a linear segment of items into a striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements. More...
|
|
template<int BLOCK_THREADS, typename T , int ITEMS_PER_THREAD, typename OutputIteratorT > |
__device__ __forceinline__ void | cub::StoreDirectStriped (int linear_tid, OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD]) |
| Store a striped arrangement of data across the thread block into a linear segment of items. More...
|
|
template<int BLOCK_THREADS, typename T , int ITEMS_PER_THREAD, typename OutputIteratorT > |
__device__ __forceinline__ void | cub::StoreDirectStriped (int linear_tid, OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD], int valid_items) |
| Store a striped arrangement of data across the thread block into a linear segment of items, guarded by range. More...
|
|
|
template<typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT > |
__device__ __forceinline__ void | cub::LoadDirectWarpStriped (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD]) |
| Load a linear segment of items into a warp-striped arrangement across the thread block. More...
|
|
template<typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT > |
__device__ __forceinline__ void | cub::LoadDirectWarpStriped (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items) |
| Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range. More...
|
|
template<typename InputT , typename DefaultT , int ITEMS_PER_THREAD, typename InputIteratorT > |
__device__ __forceinline__ void | cub::LoadDirectWarpStriped (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) |
| Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements. More...
|
|
template<typename T , int ITEMS_PER_THREAD, typename OutputIteratorT > |
__device__ __forceinline__ void | cub::StoreDirectWarpStriped (int linear_tid, OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD]) |
| Store a warp-striped arrangement of data across the thread block into a linear segment of items. More...
|
|
template<typename T , int ITEMS_PER_THREAD, typename OutputIteratorT > |
__device__ __forceinline__ void | cub::StoreDirectWarpStriped (int linear_tid, OutputIteratorT block_itr, T(&items)[ITEMS_PER_THREAD], int valid_items) |
| Store a warp-striped arrangement of data across the thread block into a linear segment of items, guarded by range. More...
|
|
|
template<CacheLoadModifier MODIFIER, typename InputIteratorT > |
__device__ __forceinline__
cub::detail::value_t
< InputIteratorT > | cub::ThreadLoad (InputIteratorT itr) |
| Thread utility for reading memory using cub::CacheLoadModifier cache modifiers. Can be used to load any data type. More...
|
|
template<CacheStoreModifier MODIFIER, typename OutputIteratorT , typename T > |
__device__ __forceinline__ void | cub::ThreadStore (OutputIteratorT itr, T val) |
| Thread utility for writing memory using cub::CacheStoreModifier cache modifiers. Can be used to store any data type. More...
|
|
template<typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectBlocked |
( |
int |
linear_tid, |
|
|
InputIteratorT |
block_itr, |
|
|
InputT(&) |
items[ITEMS_PER_THREAD] |
|
) |
| |
Load a linear segment of items into a blocked arrangement across the thread block.
Assumes a blocked arrangement of (block-threads*items-per-thread) items across the thread block, where thread i owns the i-th range of items-per-thread contiguous items. For multi-dimensional thread blocks, a row-major thread ordering is assumed.
- Template Parameters
-
InputT | [inferred] The data type to load. |
ITEMS_PER_THREAD | [inferred] The number of consecutive items partitioned onto each thread. |
InputIteratorT | [inferred] The random-access iterator type for input (may be a simple pointer type). |
- Parameters
-
[in] | linear_tid | A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks) |
[in] | block_itr | The thread block's base input iterator for loading from |
[out] | items | Data to load |
template<typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectBlocked |
( |
int |
linear_tid, |
|
|
InputIteratorT |
block_itr, |
|
|
InputT(&) |
items[ITEMS_PER_THREAD], |
|
|
int |
valid_items |
|
) |
| |
Load a linear segment of items into a blocked arrangement across the thread block, guarded by range.
Assumes a blocked arrangement of (block-threads*items-per-thread) items across the thread block, where thread i owns the i-th range of items-per-thread contiguous items. For multi-dimensional thread blocks, a row-major thread ordering is assumed.
- Template Parameters
-
InputT | [inferred] The data type to load. |
ITEMS_PER_THREAD | [inferred] The number of consecutive items partitioned onto each thread. |
InputIteratorT | [inferred] The random-access iterator type for input (may be a simple pointer type). |
- Parameters
-
[in] | linear_tid | A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks) |
[in] | block_itr | The thread block's base input iterator for loading from |
[out] | items | Data to load |
[in] | valid_items | Number of valid items to load |
template<typename InputT , typename DefaultT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectBlocked |
( |
int |
linear_tid, |
|
|
InputIteratorT |
block_itr, |
|
|
InputT(&) |
items[ITEMS_PER_THREAD], |
|
|
int |
valid_items, |
|
|
DefaultT |
oob_default |
|
) |
| |
Load a linear segment of items into a blocked arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.
Assumes a blocked arrangement of (block-threads*items-per-thread) items across the thread block, where thread i owns the i-th range of items-per-thread contiguous items. For multi-dimensional thread blocks, a row-major thread ordering is assumed.
- Template Parameters
-
InputT | [inferred] The data type to load. |
ITEMS_PER_THREAD | [inferred] The number of consecutive items partitioned onto each thread. |
InputIteratorT | [inferred] The random-access iterator type for input (may be a simple pointer type). |
- Parameters
-
[in] | linear_tid | A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks) |
[in] | block_itr | The thread block's base input iterator for loading from |
[out] | items | Data to load |
[in] | valid_items | Number of valid items to load |
[in] | oob_default | Default value to assign out-of-bound items |
template<typename T , int ITEMS_PER_THREAD>
__device__ __forceinline__ void cub::LoadDirectBlockedVectorized |
( |
int |
linear_tid, |
|
|
T * |
block_ptr, |
|
|
T(&) |
items[ITEMS_PER_THREAD] |
|
) |
| |
Load a linear segment of items into a blocked arrangement across the thread block.
Assumes a blocked arrangement of (block-threads*items-per-thread) items across the thread block, where thread i owns the i-th range of items-per-thread contiguous items. For multi-dimensional thread blocks, a row-major thread ordering is assumed.
- The input offset (block_ptr + block_offset) must be quad-item aligned.
- The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT:
  - ITEMS_PER_THREAD is odd
  - The data type T is not a built-in primitive or CUDA vector type (e.g., short, int2, double, float2, etc.)
- Template Parameters
-
T | [inferred] The data type to load. |
ITEMS_PER_THREAD | [inferred] The number of consecutive items partitioned onto each thread. |
- Parameters
-
[in] | linear_tid | A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks) |
[in] | block_ptr | Input pointer for loading from |
[out] | items | Data to load |
template<int BLOCK_THREADS, typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectStriped |
( |
int |
linear_tid, |
|
|
InputIteratorT |
block_itr, |
|
|
InputT(&) |
items[ITEMS_PER_THREAD] |
|
) |
| |
Load a linear segment of items into a striped arrangement across the thread block.
Assumes a striped arrangement of (block-threads*items-per-thread) items across the thread block, where thread i owns items (i), (i + block-threads), ..., (i + (block-threads*(items-per-thread-1))). For multi-dimensional thread blocks, a row-major thread ordering is assumed.
- Template Parameters
-
BLOCK_THREADS | The thread block size in threads |
InputT | [inferred] The data type to load. |
ITEMS_PER_THREAD | [inferred] The number of consecutive items partitioned onto each thread. |
InputIteratorT | [inferred] The random-access iterator type for input (may be a simple pointer type). |
- Parameters
-
[in] | linear_tid | A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks) |
[in] | block_itr | The thread block's base input iterator for loading from |
[out] | items | Data to load |
template<int BLOCK_THREADS, typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectStriped |
( |
int |
linear_tid, |
|
|
InputIteratorT |
block_itr, |
|
|
InputT(&) |
items[ITEMS_PER_THREAD], |
|
|
int |
valid_items |
|
) |
| |
Load a linear segment of items into a striped arrangement across the thread block, guarded by range.
Assumes a striped arrangement of (block-threads*items-per-thread) items across the thread block, where thread i owns items (i), (i + block-threads), ..., (i + (block-threads*(items-per-thread-1))). For multi-dimensional thread blocks, a row-major thread ordering is assumed.
- Template Parameters
-
BLOCK_THREADS | The thread block size in threads |
InputT | [inferred] The data type to load. |
ITEMS_PER_THREAD | [inferred] The number of consecutive items partitioned onto each thread. |
InputIteratorT | [inferred] The random-access iterator type for input (may be a simple pointer type). |
- Parameters
-
[in] | linear_tid | A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks) |
[in] | block_itr | The thread block's base input iterator for loading from |
[out] | items | Data to load |
[in] | valid_items | Number of valid items to load |
template<int BLOCK_THREADS, typename InputT , typename DefaultT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectStriped |
( |
int |
linear_tid, |
|
|
InputIteratorT |
block_itr, |
|
|
InputT(&) |
items[ITEMS_PER_THREAD], |
|
|
int |
valid_items, |
|
|
DefaultT |
oob_default |
|
) |
| |
Load a linear segment of items into a striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.
Assumes a striped arrangement of (block-threads*items-per-thread) items across the thread block, where thread i owns items (i), (i + block-threads), ..., (i + (block-threads*(items-per-thread-1))). For multi-dimensional thread blocks, a row-major thread ordering is assumed.
- Template Parameters
-
BLOCK_THREADS | The thread block size in threads |
InputT | [inferred] The data type to load. |
ITEMS_PER_THREAD | [inferred] The number of consecutive items partitioned onto each thread. |
InputIteratorT | [inferred] The random-access iterator type for input (may be a simple pointer type). |
- Parameters
-
[in] | linear_tid | A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks) |
[in] | block_itr | The thread block's base input iterator for loading from |
[out] | items | Data to load |
[in] | valid_items | Number of valid items to load |
[in] | oob_default | Default value to assign out-of-bound items |
template<typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectWarpStriped |
( |
int |
linear_tid, |
|
|
InputIteratorT |
block_itr, |
|
|
InputT(&) |
items[ITEMS_PER_THREAD] |
|
) |
| |
Load a linear segment of items into a warp-striped arrangement across the thread block.
Assumes a warp-striped arrangement of elements across threads, where warp i owns the i-th range of (warp-threads*items-per-thread) contiguous items, and each thread owns items (i), (i + warp-threads), ..., (i + (warp-threads*(items-per-thread-1))).
- Usage Considerations
- The number of threads in the thread block must be a multiple of the architecture's warp size.
- Template Parameters
-
InputT | [inferred] The data type to load. |
ITEMS_PER_THREAD | [inferred] The number of consecutive items partitioned onto each thread. |
InputIteratorT | [inferred] The random-access iterator type for input (may be a simple pointer type). |
- Parameters
-
[in] | linear_tid | A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks) |
[in] | block_itr | The thread block's base input iterator for loading from |
[out] | items | Data to load |
template<typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectWarpStriped |
( |
int |
linear_tid, |
|
|
InputIteratorT |
block_itr, |
|
|
InputT(&) |
items[ITEMS_PER_THREAD], |
|
|
int |
valid_items |
|
) |
| |
Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range.
Assumes a warp-striped arrangement of elements across threads, where warp i owns the i-th range of (warp-threads*items-per-thread) contiguous items, and each thread owns items (i), (i + warp-threads), ..., (i + (warp-threads*(items-per-thread-1))).
- Usage Considerations
- The number of threads in the thread block must be a multiple of the architecture's warp size.
- Template Parameters
-
InputT | [inferred] The data type to load. |
ITEMS_PER_THREAD | [inferred] The number of consecutive items partitioned onto each thread. |
InputIteratorT | [inferred] The random-access iterator type for input (may be a simple pointer type). |
- Parameters
-
[in] | linear_tid | A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks) |
[in] | block_itr | The thread block's base input iterator for loading from |
[out] | items | Data to load |
[in] | valid_items | Number of valid items to load |
template<typename InputT , typename DefaultT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectWarpStriped |
( |
int |
linear_tid, |
|
|
InputIteratorT |
block_itr, |
|
|
InputT(&) |
items[ITEMS_PER_THREAD], |
|
|
int |
valid_items, |
|
|
DefaultT |
oob_default |
|
) |
| |
Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.
Assumes a warp-striped arrangement of elements across threads, where warp i owns the i-th range of (warp-threads*items-per-thread) contiguous items, and each thread owns items (i), (i + warp-threads), ..., (i + (warp-threads*(items-per-thread-1))).
- Usage Considerations
- The number of threads in the thread block must be a multiple of the architecture's warp size.
- Template Parameters
-
InputT | [inferred] The data type to load. |
ITEMS_PER_THREAD | [inferred] The number of consecutive items partitioned onto each thread. |
InputIteratorT | [inferred] The random-access iterator type for input (may be a simple pointer type). |
- Parameters
-
[in] | linear_tid | A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks) |
[in] | block_itr | The thread block's base input iterator for loading from |
[out] | items | Data to load |
[in] | valid_items | Number of valid items to load |
[in] | oob_default | Default value to assign out-of-bound items |
template<typename T , int ITEMS_PER_THREAD, typename OutputIteratorT >
__device__ __forceinline__ void cub::StoreDirectBlocked |
( |
int |
linear_tid, |
|
|
OutputIteratorT |
block_itr, |
|
|
T(&) |
items[ITEMS_PER_THREAD] |
|
) |
| |
Store a blocked arrangement of items across a thread block into a linear segment of items.
Assumes a blocked arrangement of (block-threads*items-per-thread) items across the thread block, where thread i owns the i-th range of items-per-thread contiguous items. For multi-dimensional thread blocks, a row-major thread ordering is assumed.
- Template Parameters
-
T | [inferred] The data type to store. |
ITEMS_PER_THREAD | [inferred] The number of consecutive items partitioned onto each thread. |
OutputIteratorT | [inferred] The random-access iterator type for output (may be a simple pointer type). |
- Parameters
-
[in] | linear_tid | A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks) |
[in] | block_itr | The thread block's base output iterator for storing to |
[in] | items | Data to store |
template<typename T , int ITEMS_PER_THREAD, typename OutputIteratorT >
__device__ __forceinline__ void cub::StoreDirectBlocked |
( |
int |
linear_tid, |
|
|
OutputIteratorT |
block_itr, |
|
|
T(&) |
items[ITEMS_PER_THREAD], |
|
|
int |
valid_items |
|
) |
| |
Store a blocked arrangement of items across a thread block into a linear segment of items, guarded by range.
Assumes a blocked arrangement of (block-threads*items-per-thread) items across the thread block, where thread i owns the i-th range of items-per-thread contiguous items. For multi-dimensional thread blocks, a row-major thread ordering is assumed.
- Template Parameters
-
T | [inferred] The data type to store. |
ITEMS_PER_THREAD | [inferred] The number of consecutive items partitioned onto each thread. |
OutputIteratorT | [inferred] The random-access iterator type for output (may be a simple pointer type). |
- Parameters
-
[in] | linear_tid | A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks) |
[in] | block_itr | The thread block's base output iterator for storing to |
[in] | items | Data to store |
[in] | valid_items | Number of valid items to write |
template<typename T , int ITEMS_PER_THREAD>
__device__ __forceinline__ void cub::StoreDirectBlockedVectorized |
( |
int |
linear_tid, |
|
|
T * |
block_ptr, |
|
|
T(&) |
items[ITEMS_PER_THREAD] |
|
) |
| |
Store a blocked arrangement of items across a thread block into a linear segment of items.
Assumes a blocked arrangement of (block-threads*items-per-thread) items across the thread block, where thread i owns the i-th range of items-per-thread contiguous items. For multi-dimensional thread blocks, a row-major thread ordering is assumed.
- The output offset (block_ptr + block_offset) must be quad-item aligned, which is the default starting offset returned by cudaMalloc()
- The following conditions will prevent vectorization and storing will fall back to cub::BLOCK_STORE_DIRECT:
  - ITEMS_PER_THREAD is odd
  - The data type T is not a built-in primitive or CUDA vector type (e.g., short, int2, double, float2, etc.)
- Template Parameters
-
T | [inferred] The data type to store. |
ITEMS_PER_THREAD | [inferred] The number of consecutive items partitioned onto each thread. |
- Parameters
-
[in] | linear_tid | A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks) |
[in] | block_ptr | Output pointer for storing to |
[in] | items | Data to store |
template<int BLOCK_THREADS, typename T , int ITEMS_PER_THREAD, typename OutputIteratorT >
__device__ __forceinline__ void cub::StoreDirectStriped |
( |
int |
linear_tid, |
|
|
OutputIteratorT |
block_itr, |
|
|
T(&) |
items[ITEMS_PER_THREAD] |
|
) |
| |
Store a striped arrangement of data across the thread block into a linear segment of items.
Assumes a striped arrangement of (block-threads*items-per-thread) items across the thread block, where thread i owns items (i), (i + block-threads), ..., (i + (block-threads*(items-per-thread-1))). For multi-dimensional thread blocks, a row-major thread ordering is assumed.
- Template Parameters
-
BLOCK_THREADS | The thread block size in threads |
T | [inferred] The data type to store. |
ITEMS_PER_THREAD | [inferred] The number of consecutive items partitioned onto each thread. |
OutputIteratorT | [inferred] The random-access iterator type for output (may be a simple pointer type). |
- Parameters
-
[in] | linear_tid | A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks) |
[in] | block_itr | The thread block's base output iterator for storing to |
[in] | items | Data to store |
template<int BLOCK_THREADS, typename T , int ITEMS_PER_THREAD, typename OutputIteratorT >
__device__ __forceinline__ void cub::StoreDirectStriped |
( |
int |
linear_tid, |
|
|
OutputIteratorT |
block_itr, |
|
|
T(&) |
items[ITEMS_PER_THREAD], |
|
|
int |
valid_items |
|
) |
| |
Store a striped arrangement of data across the thread block into a linear segment of items, guarded by range.
Assumes a striped arrangement of (block-threads*items-per-thread) items across the thread block, where thread i owns items (i), (i + block-threads), ..., (i + (block-threads*(items-per-thread-1))). For multi-dimensional thread blocks, a row-major thread ordering is assumed.
- Template Parameters
-
BLOCK_THREADS | The thread block size in threads |
T | [inferred] The data type to store. |
ITEMS_PER_THREAD | [inferred] The number of consecutive items partitioned onto each thread. |
OutputIteratorT | [inferred] The random-access iterator type for output (may be a simple pointer type). |
- Parameters
-
[in] | linear_tid | A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks) |
[in] | block_itr | The thread block's base output iterator for storing to |
[in] | items | Data to store |
[in] | valid_items | Number of valid items to write |
template<typename T , int ITEMS_PER_THREAD, typename OutputIteratorT >
__device__ __forceinline__ void cub::StoreDirectWarpStriped |
( |
int |
linear_tid, |
|
|
OutputIteratorT |
block_itr, |
|
|
T(&) |
items[ITEMS_PER_THREAD] |
|
) |
| |
Store a warp-striped arrangement of data across the thread block into a linear segment of items.
Assumes a warp-striped arrangement of elements across threads, where warp i owns the i-th range of (warp-threads*items-per-thread) contiguous items, and each thread owns items (i), (i + warp-threads), ..., (i + (warp-threads*(items-per-thread-1))).
- Usage Considerations
- The number of threads in the thread block must be a multiple of the architecture's warp size.
- Template Parameters
-
T | [inferred] The data type to store. |
ITEMS_PER_THREAD | [inferred] The number of consecutive items partitioned onto each thread. |
OutputIteratorT | [inferred] The random-access iterator type for output (may be a simple pointer type). |
- Parameters
-
[in] | linear_tid | A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks) |
[in] | block_itr | The thread block's base output iterator for storing to |
[in] | items | Data to store |
template<typename T , int ITEMS_PER_THREAD, typename OutputIteratorT >
__device__ __forceinline__ void cub::StoreDirectWarpStriped |
( |
int |
linear_tid, |
|
|
OutputIteratorT |
block_itr, |
|
|
T(&) |
items[ITEMS_PER_THREAD], |
|
|
int |
valid_items |
|
) |
| |
Store a warp-striped arrangement of data across the thread block into a linear segment of items, guarded by range.
Assumes a warp-striped arrangement of elements across threads, where warp i owns the i-th range of (warp-threads*items-per-thread) contiguous items, and each thread owns items (i), (i + warp-threads), ..., (i + (warp-threads*(items-per-thread-1))).
- Usage Considerations
- The number of threads in the thread block must be a multiple of the architecture's warp size.
- Template Parameters
-
T | [inferred] The data type to store. |
ITEMS_PER_THREAD | [inferred] The number of consecutive items partitioned onto each thread. |
OutputIteratorT | [inferred] The random-access iterator type for output (may be a simple pointer type). |
- Parameters
-
[in] | linear_tid | A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks) |
[in] | block_itr | The thread block's base output iterator for storing to |
[in] | items | Data to store |
[in] | valid_items | Number of valid items to write |