template<
typename KeyT,
int ITEMS_PER_THREAD,
int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS,
typename ValueT = NullType,
int PTX_ARCH = CUB_PTX_ARCH>
class WarpMergeSort< KeyT, ITEMS_PER_THREAD, LOGICAL_WARP_THREADS, ValueT, PTX_ARCH >
The WarpMergeSort class provides methods for sorting items partitioned across a CUDA warp using a merge sorting method.
- Template Parameters
-
KeyT | Key type |
ITEMS_PER_THREAD | The number of items per thread |
LOGICAL_WARP_THREADS | [optional] The number of threads per "logical" warp (may be less than the number of hardware warp threads). Default is the warp size of the targeted CUDA compute-capability (e.g., 32 threads for SM86). Must be a power of two. |
ValueT | [optional] Value type (default: cub::NullType, which indicates a keys-only sort) |
PTX_ARCH | [optional] The PTX compute capability for which to to specialize this collective, formatted as per the CUDA_ARCH macro (e.g., 350 for sm_35). Useful for determining the collective's storage requirements for a given device from the host. (Default: the value of CUDA_ARCH during the current compiler pass) |
- Overview
- WarpMergeSort arranges items into ascending order using a comparison functor with less-than semantics. Merge sort can handle arbitrary types and comparison functors.
- A Simple Example
- The code snippet below illustrates a sort of 64 integer keys that are partitioned across 16 threads where each thread owns 4 consecutive items.
struct CustomLess
{
template <typename DataType>
__device__ bool operator()(const DataType &lhs, const DataType &rhs)
{
return lhs < rhs;
}
};
__global__ void ExampleKernel(...)
{
constexpr int warp_threads = 16;
constexpr int block_threads = 256;
constexpr int items_per_thread = 4;
constexpr int warps_per_block = block_threads / warp_threads;
const int warp_id = static_cast<int>(threadIdx.x) / warp_threads;
using WarpMergeSortT =
cub::WarpMergeSort<int, items_per_thread, warp_threads>;
__shared__ typename WarpMergeSort::TempStorage temp_storage[warps_per_block];
int thread_keys[items_per_thread];
WarpMergeSort(temp_storage[warp_id]).Sort(thread_keys, CustomLess());
}
- Suppose the set of input
thread_keys
across the block of threads is { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }
. The corresponding output thread_keys
in those threads will be { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }
.
|
| WarpMergeSort ()=delete |
|
__device__ __forceinline__ | WarpMergeSort (typename BlockMergeSortStrategyT::TempStorage &temp_storage) |
|
__device__ __forceinline__
unsigned int | get_member_mask () const |
|
| BlockMergeSortStrategy ()=delete |
|
__device__ __forceinline__ | BlockMergeSortStrategy (unsigned int linear_tid) |
|
__device__ __forceinline__ | BlockMergeSortStrategy (TempStorage &temp_storage, unsigned int linear_tid) |
|
__device__ __forceinline__
unsigned int | get_linear_tid () const |
|
__device__ __forceinline__ void | Sort (KeyT(&keys)[ITEMS_PER_THREAD], CompareOp compare_op) |
| Sorts items partitioned across a CUDA thread block using a merge sorting method. More...
|
|
__device__ __forceinline__ void | Sort (KeyT(&keys)[ITEMS_PER_THREAD], CompareOp compare_op, int valid_items, KeyT oob_default) |
| Sorts items partitioned across a CUDA thread block using a merge sorting method. More...
|
|
__device__ __forceinline__ void | Sort (KeyT(&keys)[ITEMS_PER_THREAD], ValueT(&items)[ITEMS_PER_THREAD], CompareOp compare_op) |
| Sorts items partitioned across a CUDA thread block using a merge sorting method. More...
|
|
__device__ __forceinline__ void | Sort (KeyT(&keys)[ITEMS_PER_THREAD], ValueT(&items)[ITEMS_PER_THREAD], CompareOp compare_op, int valid_items, KeyT oob_default) |
| Sorts items partitioned across a CUDA thread block using a merge sorting method. More...
|
|
__device__ __forceinline__ void | StableSort (KeyT(&keys)[ITEMS_PER_THREAD], CompareOp compare_op) |
| Sorts items partitioned across a CUDA thread block using a merge sorting method. More...
|
|
__device__ __forceinline__ void | StableSort (KeyT(&keys)[ITEMS_PER_THREAD], ValueT(&items)[ITEMS_PER_THREAD], CompareOp compare_op) |
| Sorts items partitioned across a CUDA thread block using a merge sorting method. More...
|
|
__device__ __forceinline__ void | StableSort (KeyT(&keys)[ITEMS_PER_THREAD], CompareOp compare_op, int valid_items, KeyT oob_default) |
| Sorts items partitioned across a CUDA thread block using a merge sorting method. More...
|
|
__device__ __forceinline__ void | StableSort (KeyT(&keys)[ITEMS_PER_THREAD], ValueT(&items)[ITEMS_PER_THREAD], CompareOp compare_op, int valid_items, KeyT oob_default) |
| Sorts items partitioned across a CUDA thread block using a merge sorting method. More...
|
|