CUB
|
#include <iterator>
#include <type_traits>
#include <cub/block/block_load.cuh>
#include <cub/config.cuh>
#include <cub/iterator/cache_modified_input_iterator.cuh>
#include <cub/util_ptx.cuh>
#include <cub/util_type.cuh>
#include <cub/warp/warp_exchange.cuh>
Classes | |
class | WarpLoad< InputT, ITEMS_PER_THREAD, ALGORITHM, LOGICAL_WARP_THREADS, PTX_ARCH > |
The WarpLoad class provides collective data movement methods for loading a linear segment of items from memory into a blocked arrangement across a CUDA warp. More... | |
struct | WarpLoad< InputT, ITEMS_PER_THREAD, ALGORITHM, LOGICAL_WARP_THREADS, PTX_ARCH >::LoadInternal< WARP_LOAD_TRANSPOSE, DUMMY >::_TempStorage |
struct | WarpLoad< InputT, ITEMS_PER_THREAD, ALGORITHM, LOGICAL_WARP_THREADS, PTX_ARCH >::LoadInternal< WARP_LOAD_TRANSPOSE, DUMMY >::TempStorage |
struct | WarpLoad< InputT, ITEMS_PER_THREAD, ALGORITHM, LOGICAL_WARP_THREADS, PTX_ARCH >::TempStorage |
The operations exposed by WarpLoad require a temporary memory allocation of this nested type for thread communication. This opaque storage can be allocated directly using the __shared__ keyword. Alternatively, it can be aliased to externally allocated memory (shared or global) or union'd with other storage allocation types to facilitate memory reuse. More... | |
Enumerations | |
enum | WarpLoadAlgorithm { WARP_LOAD_DIRECT, WARP_LOAD_STRIPED, WARP_LOAD_VECTORIZE, WARP_LOAD_TRANSPOSE } |
cub::WarpLoadAlgorithm enumerates alternative algorithms for cub::WarpLoad to read a linear segment of data from memory into a CUDA warp. More... | |
Operations for reading linear tiles of data into the CUDA warp.
enum WarpLoadAlgorithm |
cub::WarpLoadAlgorithm enumerates alternative algorithms for cub::WarpLoad to read a linear segment of data from memory into a CUDA warp.
Enumerator | |
---|---|
WARP_LOAD_DIRECT |
A blocked arrangement of data is read directly from memory.
|
WARP_LOAD_STRIPED |
A striped arrangement of data is read directly from memory.
|
WARP_LOAD_VECTORIZE |
A blocked arrangement of data is read from memory using CUDA's built-in vectorized loads as a coalescing optimization. For example, ld.global.v4.s32 instructions will be generated when InputT = int and ITEMS_PER_THREAD % 4 == 0.
|
WARP_LOAD_TRANSPOSE |
A striped arrangement of data is read efficiently from memory and then locally transposed into a blocked arrangement.
|