template<
typename KeyT,
int BLOCK_DIM_X,
int ITEMS_PER_THREAD,
typename ValueT = NullType,
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1>
class BlockMergeSort< KeyT, BLOCK_DIM_X, ITEMS_PER_THREAD, ValueT, BLOCK_DIM_Y, BLOCK_DIM_Z >
The BlockMergeSort class provides methods for sorting items partitioned across a CUDA thread block using a merge sorting method.
- Template Parameters
-
KeyT | Key type |
BLOCK_DIM_X | The thread block length in threads along the X dimension |
ITEMS_PER_THREAD | The number of items per thread |
ValueT | **[optional]** ValueT type (default: cub::NullType , which indicates a keys-only sort) |
BLOCK_DIM_Y | **[optional]** The thread block length in threads along the Y dimension (default: 1) |
BLOCK_DIM_Z | **[optional]** The thread block length in threads along the Z dimension (default: 1) |
- Overview
- BlockMergeSort arranges items into ascending order using a comparison functor with less-than semantics. Merge sort can handle arbitrary types and comparison functors, but is slower than BlockRadixSort when sorting arithmetic types into ascending/descending order.
- A Simple Example
- Every thread in the block uses the BlockMergeSort class by first specializing the BlockMergeSort type, then instantiating an instance with parameters for communication, and finally invoking one or more collective member functions.
- The code snippet below illustrates a sort of 512 integer keys that are partitioned across 128 threads where each thread owns 4 consecutive items.
struct CustomLess
{
template <typename DataType>
__device__ bool operator()(const DataType &lhs, const DataType &rhs)
{
return lhs < rhs;
}
};
__global__ void ExampleKernel(...)
{
// Specialize BlockMergeSort for a 1D block of 128 threads owning 4 integer items each
typedef cub::BlockMergeSort<int, 128, 4> BlockMergeSort;
// Allocate shared memory for BlockMergeSort
__shared__ typename BlockMergeSort::TempStorage temp_storage_shuffle;
// Obtain a segment of consecutive items that are blocked across threads
int thread_keys[4];
...
BlockMergeSort(temp_storage_shuffle).Sort(thread_keys, CustomLess());
...
}
- Suppose the set of input
thread_keys
across the block of threads is { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }
. The corresponding output thread_keys
in those threads will be { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }
.
- Re-using dynamically allocated shared memory
- The following example under the examples/block folder illustrates usage of dynamically shared memory with BlockReduce and how to re-purpose the same memory region: example_block_reduce_dyn_smem.cu
This example can be easily adapted to the storage required by BlockMergeSort.
|
__device__ __forceinline__ | BlockMergeSort () |
|
__device__ __forceinline__ | BlockMergeSort (typename BlockMergeSortStrategyT::TempStorage &temp_storage) |
|
| BlockMergeSortStrategy ()=delete |
|
__device__ __forceinline__ | BlockMergeSortStrategy (unsigned int linear_tid) |
|
__device__ __forceinline__ | BlockMergeSortStrategy (TempStorage &temp_storage, unsigned int linear_tid) |
|
__device__ __forceinline__
unsigned int | get_linear_tid () const |
|
__device__ __forceinline__ void | Sort (KeyT(&keys)[ITEMS_PER_THREAD], CompareOp compare_op) |
| Sorts items partitioned across a CUDA thread block using a merge sorting method. More...
|
|
__device__ __forceinline__ void | Sort (KeyT(&keys)[ITEMS_PER_THREAD], CompareOp compare_op, int valid_items, KeyT oob_default) |
| Sorts items partitioned across a CUDA thread block using a merge sorting method. More...
|
|
__device__ __forceinline__ void | Sort (KeyT(&keys)[ITEMS_PER_THREAD], ValueT(&items)[ITEMS_PER_THREAD], CompareOp compare_op) |
| Sorts items partitioned across a CUDA thread block using a merge sorting method. More...
|
|
__device__ __forceinline__ void | Sort (KeyT(&keys)[ITEMS_PER_THREAD], ValueT(&items)[ITEMS_PER_THREAD], CompareOp compare_op, int valid_items, KeyT oob_default) |
| Sorts items partitioned across a CUDA thread block using a merge sorting method. More...
|
|
__device__ __forceinline__ void | StableSort (KeyT(&keys)[ITEMS_PER_THREAD], CompareOp compare_op) |
| Sorts items partitioned across a CUDA thread block using a merge sorting method. More...
|
|
__device__ __forceinline__ void | StableSort (KeyT(&keys)[ITEMS_PER_THREAD], ValueT(&items)[ITEMS_PER_THREAD], CompareOp compare_op) |
| Sorts items partitioned across a CUDA thread block using a merge sorting method. More...
|
|
__device__ __forceinline__ void | StableSort (KeyT(&keys)[ITEMS_PER_THREAD], CompareOp compare_op, int valid_items, KeyT oob_default) |
| Sorts items partitioned across a CUDA thread block using a merge sorting method. More...
|
|
__device__ __forceinline__ void | StableSort (KeyT(&keys)[ITEMS_PER_THREAD], ValueT(&items)[ITEMS_PER_THREAD], CompareOp compare_op, int valid_items, KeyT oob_default) |
| Sorts items partitioned across a CUDA thread block using a merge sorting method. More...
|
|