CUB
|
BlockAdjacentDifference provides collective methods for computing the differences of adjacent elements partitioned across a CUDA thread block.
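Methods named SubtractLeft subtract the left element i - 1 of the input sequence from the current element i. Methods named SubtractRight subtract the right element i + 1 from the current element i.
When the required neighbor lies outside the tile, the input value is passed through without modification: for SubtractLeft, the first output item is input[0].
Example: using BlockAdjacentDifference to compute the left difference between adjacent elements. Suppose the set of input thread_data across the block of threads is { [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }. The corresponding output result in those threads will be { [4,-2,-1,0], [0,0,0,0], [1,1,0,0], [0,1,-3,3], ... }.
Classes | |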
struct | TempStorage |
The operations exposed by BlockAdjacentDifference require a temporary memory allocation of this nested type for thread communication. This opaque storage can be allocated directly using the __shared__ keyword. Alternatively, it can be aliased to externally allocated memory (shared or global) or union'd with other storage allocation types to facilitate memory reuse. More... | |
Public Methods | |
Collective constructors | |
__device__ __forceinline__ | BlockAdjacentDifference () |
Collective constructor using a private static allocation of shared memory as temporary storage. More... | |
__device__ __forceinline__ | BlockAdjacentDifference (TempStorage &temp_storage) |
Collective constructor using the specified memory allocation as temporary storage. More... | |
Read left operations | |
template<int ITEMS_PER_THREAD, typename OutputType , typename DifferenceOpT > | |
__device__ __forceinline__ void | SubtractLeft (T(&input)[ITEMS_PER_THREAD], OutputType(&output)[ITEMS_PER_THREAD], DifferenceOpT difference_op) |
Subtracts the left element of each adjacent pair of elements partitioned across a CUDA thread block. More... | |
template<int ITEMS_PER_THREAD, typename OutputT , typename DifferenceOpT > | |
__device__ __forceinline__ void | SubtractLeft (T(&input)[ITEMS_PER_THREAD], OutputT(&output)[ITEMS_PER_THREAD], DifferenceOpT difference_op, T tile_predecessor_item) |
Subtracts the left element of each adjacent pair of elements partitioned across a CUDA thread block. More... | |
template<int ITEMS_PER_THREAD, typename OutputType , typename DifferenceOpT > | |
__device__ __forceinline__ void | SubtractLeftPartialTile (T(&input)[ITEMS_PER_THREAD], OutputType(&output)[ITEMS_PER_THREAD], DifferenceOpT difference_op, int valid_items) |
Subtracts the left element of each adjacent pair of elements partitioned across a CUDA thread block. More... | |
Read right operations | |
template<int ITEMS_PER_THREAD, typename OutputT , typename DifferenceOpT > | |
__device__ __forceinline__ void | SubtractRight (T(&input)[ITEMS_PER_THREAD], OutputT(&output)[ITEMS_PER_THREAD], DifferenceOpT difference_op) |
Subtracts the right element of each adjacent pair of elements partitioned across a CUDA thread block. More... | |
template<int ITEMS_PER_THREAD, typename OutputT , typename DifferenceOpT > | |
__device__ __forceinline__ void | SubtractRight (T(&input)[ITEMS_PER_THREAD], OutputT(&output)[ITEMS_PER_THREAD], DifferenceOpT difference_op, T tile_successor_item) |
Subtracts the right element of each adjacent pair of elements partitioned across a CUDA thread block. More... | |
template<int ITEMS_PER_THREAD, typename OutputT , typename DifferenceOpT > | |
__device__ __forceinline__ void | SubtractRightPartialTile (T(&input)[ITEMS_PER_THREAD], OutputT(&output)[ITEMS_PER_THREAD], DifferenceOpT difference_op, int valid_items) |
Subtracts the right element of each adjacent pair in range of elements partitioned across a CUDA thread block. More... | |
Head flag operations (deprecated) | |
template<int ITEMS_PER_THREAD, typename FlagT , typename FlagOp > | |
__device__ __forceinline__ void | FlagHeads (FlagT(&output)[ITEMS_PER_THREAD], T(&input)[ITEMS_PER_THREAD], FlagOp flag_op) |
template<int ITEMS_PER_THREAD, typename FlagT , typename FlagOp > | |
__device__ __forceinline__ void | FlagHeads (FlagT(&output)[ITEMS_PER_THREAD], T(&input)[ITEMS_PER_THREAD], FlagOp flag_op, T tile_predecessor_item) |
template<int ITEMS_PER_THREAD, typename FlagT , typename FlagOp > | |
__device__ __forceinline__ void | FlagTails (FlagT(&output)[ITEMS_PER_THREAD], T(&input)[ITEMS_PER_THREAD], FlagOp flag_op) |
template<int ITEMS_PER_THREAD, typename FlagT , typename FlagOp > | |
__device__ __forceinline__ void | FlagTails (FlagT(&output)[ITEMS_PER_THREAD], T(&input)[ITEMS_PER_THREAD], FlagOp flag_op, T tile_successor_item) |
template<int ITEMS_PER_THREAD, typename FlagT , typename FlagOp > | |
__device__ __forceinline__ void | FlagHeadsAndTails (FlagT(&head_flags)[ITEMS_PER_THREAD], FlagT(&tail_flags)[ITEMS_PER_THREAD], T(&input)[ITEMS_PER_THREAD], FlagOp flag_op) |
template<int ITEMS_PER_THREAD, typename FlagT , typename FlagOp > | |
__device__ __forceinline__ void | FlagHeadsAndTails (FlagT(&head_flags)[ITEMS_PER_THREAD], FlagT(&tail_flags)[ITEMS_PER_THREAD], T tile_successor_item, T(&input)[ITEMS_PER_THREAD], FlagOp flag_op) |
template<int ITEMS_PER_THREAD, typename FlagT , typename FlagOp > | |
__device__ __forceinline__ void | FlagHeadsAndTails (FlagT(&head_flags)[ITEMS_PER_THREAD], T tile_predecessor_item, FlagT(&tail_flags)[ITEMS_PER_THREAD], T(&input)[ITEMS_PER_THREAD], FlagOp flag_op) |
template<int ITEMS_PER_THREAD, typename FlagT , typename FlagOp > | |
__device__ __forceinline__ void | FlagHeadsAndTails (FlagT(&head_flags)[ITEMS_PER_THREAD], T tile_predecessor_item, FlagT(&tail_flags)[ITEMS_PER_THREAD], T tile_successor_item, T(&input)[ITEMS_PER_THREAD], FlagOp flag_op) |
|
inline |
Collective constructor using a private static allocation of shared memory as temporary storage.
|
inline |
Collective constructor using the specified memory allocation as temporary storage.
[in] | temp_storage | Reference to memory allocation having layout type TempStorage |
|
inline |
Subtracts the left element of each adjacent pair of elements partitioned across a CUDA thread block.
A subsequent __syncthreads() threadblock barrier should be invoked after calling this method if the collective's temporary storage (e.g., temp_storage) is to be reused or repurposed.
Example: using BlockAdjacentDifference to compute the left difference between adjacent elements. Suppose the set of input thread_data across the block of threads is { [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }. The corresponding output result in those threads will be { [4,-2,-1,0], [0,0,0,0], [1,1,0,0], [0,1,-3,3], ... }.
[out] | output | Calling thread's adjacent difference result |
[in] | input | Calling thread's input items (may be aliased to output ) |
[in] | difference_op | Binary difference operator |
|
inline |
Subtracts the left element of each adjacent pair of elements partitioned across a CUDA thread block.
A subsequent __syncthreads() threadblock barrier should be invoked after calling this method if the collective's temporary storage (e.g., temp_storage) is to be reused or repurposed.
Example: using BlockAdjacentDifference to compute the left difference between adjacent elements. Suppose the set of input thread_data across the block of threads is { [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }, and that tile_predecessor_item is 3. The corresponding output result in those threads will be { [1,-2,-1,0], [0,0,0,0], [1,1,0,0], [0,1,-3,3], ... }.
[out] | output | Calling thread's adjacent difference result |
[in] | input | Calling thread's input items (may be aliased to output ) |
[in] | difference_op | Binary difference operator |
[in] | tile_predecessor_item | [thread 0 only] Item which is going to be subtracted from the first tile item (input[0] from thread 0). |
|
inline |
Subtracts the left element of each adjacent pair of elements partitioned across a CUDA thread block.
A subsequent __syncthreads() threadblock barrier should be invoked after calling this method if the collective's temporary storage (e.g., temp_storage) is to be reused or repurposed.
Example: using BlockAdjacentDifference to compute the left difference between adjacent elements. Suppose the set of input thread_data across the block of threads is { [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }. The corresponding output result in those threads will be { [4,-2,-1,0], [0,0,0,0], [1,1,0,0], [0,1,-3,3], ... }.
[out] | output | Calling thread's adjacent difference result |
[in] | input | Calling thread's input items (may be aliased to output ) |
[in] | difference_op | Binary difference operator |
[in] | valid_items | Number of valid items in thread block |
|
inline |
Subtracts the right element of each adjacent pair of elements partitioned across a CUDA thread block.
A subsequent __syncthreads() threadblock barrier should be invoked after calling this method if the collective's temporary storage (e.g., temp_storage) is to be reused or repurposed.
Example: using BlockAdjacentDifference to compute the right difference between adjacent elements. Suppose the set of input thread_data across the block of threads is { ...3], [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4] }. The corresponding output result in those threads will be { ...-1], [2,1,0,0], [0,0,0,-1], [-1,0,0,0], [-1,3,-3,4] }.
[out] | output | Calling thread's adjacent difference result |
[in] | input | Calling thread's input items (may be aliased to output ) |
[in] | difference_op | Binary difference operator |
|
inline |
Subtracts the right element of each adjacent pair of elements partitioned across a CUDA thread block.
A subsequent __syncthreads() threadblock barrier should be invoked after calling this method if the collective's temporary storage (e.g., temp_storage) is to be reused or repurposed.
Example: using BlockAdjacentDifference to compute the right difference between adjacent elements. Suppose the set of input thread_data across the block of threads is { ...3], [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4] }, and that tile_successor_item is 3. The corresponding output result in those threads will be { ...-1], [2,1,0,0], [0,0,0,-1], [-1,0,0,0], [-1,3,-3,1] }.
[out] | output | Calling thread's adjacent difference result |
[in] | input | Calling thread's input items (may be aliased to output ) |
[in] | difference_op | Binary difference operator |
[in] | tile_successor_item | [thread BLOCK_THREADS-1 only] Item which is going to be subtracted from the last tile item (input[ITEMS_PER_THREAD-1] from thread BLOCK_THREADS-1). |
|
inline |
Subtracts the right element of each adjacent pair in range of elements partitioned across a CUDA thread block.
A subsequent __syncthreads() threadblock barrier should be invoked after calling this method if the collective's temporary storage (e.g., temp_storage) is to be reused or repurposed.
Example: using BlockAdjacentDifference to compute the right difference between adjacent elements. Suppose the set of input thread_data across the block of threads is { ...3], [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4] }, and that valid_items is 507. The corresponding output result in those threads will be { ...-1], [2,1,0,0], [0,0,0,-1], [-1,0,3,3], [3,4,1,4] }.
[out] | output | Calling thread's adjacent difference result |
[in] | input | Calling thread's input items (may be aliased to output ) |
[in] | difference_op | Binary difference operator |
[in] | valid_items | Number of valid items in thread block |
|
inline |
[out] | output | Calling thread's discontinuity result |
[in] | input | Calling thread's input items |
[in] | flag_op | Binary boolean flag predicate |
|
inline |
[out] | output | Calling thread's discontinuity result |
[in] | input | Calling thread's input items |
[in] | flag_op | Binary boolean flag predicate |
[in] | tile_predecessor_item | [thread 0 only] Item with which to compare the first tile item (input[0] from thread 0). |
|
inline |
[out] | output | Calling thread's discontinuity result |
[in] | input | Calling thread's input items |
[in] | flag_op | Binary boolean flag predicate |
|
inline |
[out] | output | Calling thread's discontinuity result |
[in] | input | Calling thread's input items |
[in] | flag_op | Binary boolean flag predicate |
[in] | tile_successor_item | [thread BLOCK_THREADS-1 only] Item with which to compare the last tile item (input[ITEMS_PER_THREAD-1] from thread BLOCK_THREADS-1). |
|
inline |
[out] | head_flags | Calling thread's discontinuity head_flags |
[out] | tail_flags | Calling thread's discontinuity tail_flags |
[in] | input | Calling thread's input items |
[in] | flag_op | Binary boolean flag predicate |
|
inline |
[out] | head_flags | Calling thread's discontinuity head_flags |
[out] | tail_flags | Calling thread's discontinuity tail_flags |
[in] | tile_successor_item | [thread BLOCK_THREADS-1 only] Item with which to compare the last tile item (input[ITEMS_PER_THREAD-1] from thread BLOCK_THREADS-1). |
[in] | input | Calling thread's input items |
[in] | flag_op | Binary boolean flag predicate |
|
inline |
[out] | head_flags | Calling thread's discontinuity head_flags |
[in] | tile_predecessor_item | [thread 0 only] Item with which to compare the first tile item (input[0] from thread 0). |
[out] | tail_flags | Calling thread's discontinuity tail_flags |
[in] | input | Calling thread's input items |
[in] | flag_op | Binary boolean flag predicate |
|
inline |
[out] | head_flags | Calling thread's discontinuity head_flags |
[in] | tile_predecessor_item | [thread 0 only] Item with which to compare the first tile item (input[0] from thread 0). |
[out] | tail_flags | Calling thread's discontinuity tail_flags |
[in] | tile_successor_item | [thread BLOCK_THREADS-1 only] Item with which to compare the last tile item (input[ITEMS_PER_THREAD-1] from thread BLOCK_THREADS-1). |
[in] | input | Calling thread's input items |
[in] | flag_op | Binary boolean flag predicate |