CUB  
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
Public Methods | List of all members
WarpMergeSort< KeyT, ITEMS_PER_THREAD, LOGICAL_WARP_THREADS, ValueT, PTX_ARCH > Class Template Reference

Detailed description

template< typename KeyT, int ITEMS_PER_THREAD, int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS, typename ValueT = NullType, int PTX_ARCH = CUB_PTX_ARCH>
class WarpMergeSort< KeyT, ITEMS_PER_THREAD, LOGICAL_WARP_THREADS, ValueT, PTX_ARCH >

The WarpMergeSort class provides methods for sorting items partitioned across a CUDA warp using a merge sorting method.

Template Parameters
KeyTKey type
ITEMS_PER_THREADThe number of items per thread
LOGICAL_WARP_THREADS[optional] The number of threads per "logical" warp (may be less than the number of hardware warp threads). Default is the warp size of the targeted CUDA compute-capability (e.g., 32 threads for SM86). Must be a power of two.
ValueT[optional] Value type (default: cub::NullType, which indicates a keys-only sort)
PTX_ARCH[optional] The PTX compute capability for which to to specialize this collective, formatted as per the CUDA_ARCH macro (e.g., 350 for sm_35). Useful for determining the collective's storage requirements for a given device from the host. (Default: the value of CUDA_ARCH during the current compiler pass)
Overview
WarpMergeSort arranges items into ascending order using a comparison functor with less-than semantics. Merge sort can handle arbitrary types and comparison functors.
A Simple Example
The code snippet below illustrates a sort of 64 integer keys that are partitioned across 16 threads where each thread owns 4 consecutive items.
#include <cub/cub.cuh> // or equivalently <cub/warp/warp_merge_sort.cuh>
struct CustomLess
{
template <typename DataType>
__device__ bool operator()(const DataType &lhs, const DataType &rhs)
{
return lhs < rhs;
}
};
__global__ void ExampleKernel(...)
{
constexpr int warp_threads = 16;
constexpr int block_threads = 256;
constexpr int items_per_thread = 4;
constexpr int warps_per_block = block_threads / warp_threads;
const int warp_id = static_cast<int>(threadIdx.x) / warp_threads;
// Specialize WarpMergeSort for a virtual warp of 16 threads
// owning 4 integer items each
using WarpMergeSortT =
cub::WarpMergeSort<int, items_per_thread, warp_threads>;
// Allocate shared memory for WarpMergeSort
__shared__ typename WarpMergeSort::TempStorage temp_storage[warps_per_block];
// Obtain a segment of consecutive items that are blocked across threads
int thread_keys[items_per_thread];
// ...
WarpMergeSort(temp_storage[warp_id]).Sort(thread_keys, CustomLess());
// ...
}
Suppose the set of input thread_keys across the block of threads is { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The corresponding output thread_keys in those threads will be { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }.
Inheritance diagram for WarpMergeSort< KeyT, ITEMS_PER_THREAD, LOGICAL_WARP_THREADS, ValueT, PTX_ARCH >:
BlockMergeSortStrategy< KeyT, ValueT, LOGICAL_WARP_THREADS, ITEMS_PER_THREAD, WarpMergeSort< KeyT, ITEMS_PER_THREAD, LOGICAL_WARP_THREADS, ValueT, PTX_ARCH > >

Public Methods

 WarpMergeSort ()=delete
 
__device__ __forceinline__ WarpMergeSort (typename BlockMergeSortStrategyT::TempStorage &temp_storage)
 
__device__ __forceinline__
unsigned int 
get_member_mask () const
 
- Public Methods inherited from BlockMergeSortStrategy< KeyT, ValueT, LOGICAL_WARP_THREADS, ITEMS_PER_THREAD, WarpMergeSort< KeyT, ITEMS_PER_THREAD, LOGICAL_WARP_THREADS, ValueT, PTX_ARCH > >
 BlockMergeSortStrategy ()=delete
 
__device__ __forceinline__ BlockMergeSortStrategy (unsigned int linear_tid)
 
__device__ __forceinline__ BlockMergeSortStrategy (TempStorage &temp_storage, unsigned int linear_tid)
 
__device__ __forceinline__
unsigned int 
get_linear_tid () const
 
__device__ __forceinline__ void Sort (KeyT(&keys)[ITEMS_PER_THREAD], CompareOp compare_op)
 Sorts items partitioned across a CUDA thread block using a merge sorting method. More...
 
__device__ __forceinline__ void Sort (KeyT(&keys)[ITEMS_PER_THREAD], CompareOp compare_op, int valid_items, KeyT oob_default)
 Sorts items partitioned across a CUDA thread block using a merge sorting method. More...
 
__device__ __forceinline__ void Sort (KeyT(&keys)[ITEMS_PER_THREAD], ValueT(&items)[ITEMS_PER_THREAD], CompareOp compare_op)
 Sorts items partitioned across a CUDA thread block using a merge sorting method. More...
 
__device__ __forceinline__ void Sort (KeyT(&keys)[ITEMS_PER_THREAD], ValueT(&items)[ITEMS_PER_THREAD], CompareOp compare_op, int valid_items, KeyT oob_default)
 Sorts items partitioned across a CUDA thread block using a merge sorting method. More...
 
__device__ __forceinline__ void StableSort (KeyT(&keys)[ITEMS_PER_THREAD], CompareOp compare_op)
 Sorts items partitioned across a CUDA thread block using a merge sorting method. More...
 
__device__ __forceinline__ void StableSort (KeyT(&keys)[ITEMS_PER_THREAD], ValueT(&items)[ITEMS_PER_THREAD], CompareOp compare_op)
 Sorts items partitioned across a CUDA thread block using a merge sorting method. More...
 
__device__ __forceinline__ void StableSort (KeyT(&keys)[ITEMS_PER_THREAD], CompareOp compare_op, int valid_items, KeyT oob_default)
 Sorts items partitioned across a CUDA thread block using a merge sorting method. More...
 
__device__ __forceinline__ void StableSort (KeyT(&keys)[ITEMS_PER_THREAD], ValueT(&items)[ITEMS_PER_THREAD], CompareOp compare_op, int valid_items, KeyT oob_default)
 Sorts items partitioned across a CUDA thread block using a merge sorting method. More...
 

Constructor & Destructor Documentation

template<typename KeyT , int ITEMS_PER_THREAD, int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS, typename ValueT = NullType, int PTX_ARCH = CUB_PTX_ARCH>
WarpMergeSort< KeyT, ITEMS_PER_THREAD, LOGICAL_WARP_THREADS, ValueT, PTX_ARCH >::WarpMergeSort ( )
delete
template<typename KeyT , int ITEMS_PER_THREAD, int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS, typename ValueT = NullType, int PTX_ARCH = CUB_PTX_ARCH>
__device__ __forceinline__ WarpMergeSort< KeyT, ITEMS_PER_THREAD, LOGICAL_WARP_THREADS, ValueT, PTX_ARCH >::WarpMergeSort ( typename BlockMergeSortStrategyT::TempStorage temp_storage)
inline

Member Function Documentation

template<typename KeyT , int ITEMS_PER_THREAD, int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS, typename ValueT = NullType, int PTX_ARCH = CUB_PTX_ARCH>
__device__ __forceinline__ unsigned int WarpMergeSort< KeyT, ITEMS_PER_THREAD, LOGICAL_WARP_THREADS, ValueT, PTX_ARCH >::get_member_mask ( ) const
inline

The documentation for this class was generated from the following file: