/**
 * Sorts one tile of int keys per thread block with CUB's block-wide radix sort.
 *
 * Each block loads BLOCK_THREADS * ITEMS_PER_THREAD consecutive keys from
 * d_in (starting at blockIdx.x * tile size), sorts them with
 * cub::BlockRadixSort::Sort, and stores the result at the same offset in
 * d_out. Tiles are sorted independently of one another; there is no
 * cross-block merge.
 *
 * Preconditions (not checked here):
 *  - Requires the CUB headers (<cub/cub.cuh>) to be included by the file.
 *  - Expected to be launched with a 1D block of BLOCK_THREADS threads, since
 *    the collectives are specialized for that block size.
 *  - d_in and d_out must each hold at least
 *    gridDim.x * BLOCK_THREADS * ITEMS_PER_THREAD elements; there is no
 *    bounds guard for a partial last tile.
 *
 * @tparam BLOCK_THREADS     Threads per block the collectives are built for.
 * @tparam ITEMS_PER_THREAD  Number of keys each thread holds.
 * @param  d_in              Input keys.
 * @param  d_out             Output keys, written tile by tile.
 */
template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
__global__ void BlockSortKernel(int *d_in, int *d_out)
{
    // Specialize the CUB collectives for this block shape.
    // NOTE(review): these typedefs were reconstructed from the canonical CUB
    // block-sort example because the original declarations were truncated.
    // Confirm the load/store algorithm against the intended configuration, and
    // note that very old CUB releases took the iterator type (int*) rather than
    // the value type as the first BlockLoad/BlockStore template argument.
    typedef cub::BlockLoad<
        int, BLOCK_THREADS, ITEMS_PER_THREAD, cub::BLOCK_LOAD_TRANSPOSE> BlockLoadT;
    typedef cub::BlockStore<
        int, BLOCK_THREADS, ITEMS_PER_THREAD, cub::BLOCK_STORE_TRANSPOSE> BlockStoreT;
    typedef cub::BlockRadixSort<
        int, BLOCK_THREADS, ITEMS_PER_THREAD> BlockRadixSortT;

    // The three phases never overlap in time, so their scratch space shares
    // one shared-memory allocation through a union.
    __shared__ union {
        typename BlockLoadT::TempStorage load;
        typename BlockStoreT::TempStorage store;
        typename BlockRadixSortT::TempStorage sort;
    } temp_storage;

    // Keys owned by this thread.
    int thread_keys[ITEMS_PER_THREAD];

    // Start of this block's tile. Computed in size_t so the offset does not
    // wrap for inputs larger than the 32-bit range.
    const size_t block_offset =
        static_cast<size_t>(blockIdx.x) * (BLOCK_THREADS * ITEMS_PER_THREAD);

    // Phase 1: cooperative load of the tile.
    BlockLoadT(temp_storage.load).Load(d_in + block_offset, thread_keys);

    // Barrier before the union's storage is reused by the sort.
    __syncthreads();

    // Phase 2: block-wide radix sort of the tile.
    BlockRadixSortT(temp_storage.sort).Sort(thread_keys);

    // Barrier before the union's storage is reused by the store.
    __syncthreads();

    // Phase 3: cooperative store of the sorted tile.
    BlockStoreT(temp_storage.store).Store(d_out + block_offset, thread_keys);
}