CUB
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Groups
Classes | Namespaces | Enumerations
block_load.cuh File Reference
#include <iterator>
#include "block_exchange.cuh"
#include "../iterator/cache_modified_input_iterator.cuh"
#include "../util_ptx.cuh"
#include "../util_macro.cuh"
#include "../util_type.cuh"
#include "../util_namespace.cuh"

Classes

class  cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >
 The BlockLoad class provides collective data movement methods for loading a linear segment of items from memory into a blocked arrangement across a CUDA thread block.

block_load_logo.png
.
More...
 
struct  cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_TRANSPOSE, DUMMY >::_TempStorage
 Shared memory storage layout type. More...
 
struct  cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_TRANSPOSE, DUMMY >::TempStorage
 Alias wrapper allowing storage to be unioned. More...
 
struct  cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE, DUMMY >::_TempStorage
 Shared memory storage layout type. More...
 
struct  cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE, DUMMY >::TempStorage
 Alias wrapper allowing storage to be unioned. More...
 
struct  cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY >::_TempStorage
 Shared memory storage layout type. More...
 
struct  cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::LoadInternal< BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY >::TempStorage
 Alias wrapper allowing storage to be unioned. More...
 
struct  cub::BlockLoad< InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH >::TempStorage
 The operations exposed by BlockLoad require a temporary memory allocation of this nested type for thread communication. This opaque storage can be allocated directly using the __shared__ keyword. Alternatively, it can be aliased to externally allocated memory (shared or global) or union'd with other storage allocation types to facilitate memory reuse. More...
 

Namespaces

 cub
 Optional outer namespace(s)
 

Enumerations

enum  cub::BlockLoadAlgorithm {
  cub::BLOCK_LOAD_DIRECT, cub::BLOCK_LOAD_VECTORIZE, cub::BLOCK_LOAD_TRANSPOSE, cub::BLOCK_LOAD_WARP_TRANSPOSE,
  cub::BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED
}
 

Functions

Blocked arrangement I/O (direct)
template<typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectBlocked (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD])
 Load a linear segment of items into a blocked arrangement across the thread block. More...
 
template<typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectBlocked (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items)
 Load a linear segment of items into a blocked arrangement across the thread block, guarded by range. More...
 
template<typename InputT , typename DefaultT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectBlocked (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default)
 Load a linear segment of items into a blocked arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.. More...
 
template<typename T , int ITEMS_PER_THREAD>
__device__ __forceinline__ void cub::LoadDirectBlockedVectorized (int linear_tid, T *block_ptr, T(&items)[ITEMS_PER_THREAD])
 Load a linear segment of items into a blocked arrangement across the thread block. More...
 
Striped arrangement I/O (direct)
template<int BLOCK_THREADS, typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectStriped (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD])
 Load a linear segment of items into a striped arrangement across the thread block. More...
 
template<int BLOCK_THREADS, typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectStriped (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items)
 Load a linear segment of items into a striped arrangement across the thread block, guarded by range. More...
 
template<int BLOCK_THREADS, typename InputT , typename DefaultT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectStriped (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default)
 Load a linear segment of items into a striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements. More...
 
Warp-striped arrangement I/O (direct)
template<typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectWarpStriped (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD])
 Load a linear segment of items into a warp-striped arrangement across the thread block. More...
 
template<typename InputT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectWarpStriped (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items)
 Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range. More...
 
template<typename InputT , typename DefaultT , int ITEMS_PER_THREAD, typename InputIteratorT >
__device__ __forceinline__ void cub::LoadDirectWarpStriped (int linear_tid, InputIteratorT block_itr, InputT(&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default)
 Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements. More...
 

Detailed Description

Operations for reading linear tiles of data into the CUDA thread block.