|
Fermat
|
An efficient warp-synchronous atomic adder, used to add to or subtract from a shared integer.
Given a pointer to an integer (e.g. representing a "pool"), this class allows the threads in a warp to add (allocate) a per-thread integer to it, or subtract (deallocate) a per-thread integer from it.
#include <warp_atomics.h>
Classes | |
| struct | temp_storage_type |
Public Methods | |
| __device__ __forceinline__ | warp_atomic (uint32 *dest, temp_storage_type &temp_storage) |
| __device__ __forceinline__ void | add (uint32 n) |
| __device__ __forceinline__ void | sub (uint32 n) |
| template<uint32 N> | |
| __device__ __forceinline__ void | add (bool p) |
| template<uint32 N> | |
| __device__ __forceinline__ void | sub (bool p) |
| __device__ __forceinline__ void | add (uint32 n, uint32 *result) |
| __device__ __forceinline__ void | sub (uint32 n, uint32 *result) |
| template<uint32 N> | |
| __device__ __forceinline__ void | add (bool p, uint32 *result) |
| template<uint32 N> | |
| __device__ __forceinline__ void | sub (bool p, uint32 *result) |
Static Public Methods | |
| __device__ static __forceinline__ void | add (uint32 n, uint32 *dest, temp_storage_type &temp_storage) |
| __device__ static __forceinline__ void | sub (uint32 n, uint32 *dest, temp_storage_type &temp_storage) |
| __device__ static __forceinline__ void | add (uint32 n, uint32 *dest, uint32 *result, temp_storage_type &temp_storage) |
| __device__ static __forceinline__ void | sub (uint32 n, uint32 *dest, uint32 *result, temp_storage_type &temp_storage) |
| template<uint32 N> | |
| __device__ static __forceinline__ void | static_add (bool p, uint32 *dest) |
| template<uint32 N> | |
| __device__ static __forceinline__ void | static_sub (bool p, uint32 *dest) |
| template<uint32 N> | |
| __device__ static __forceinline__ void | static_add (bool p, uint32 *dest, uint32 *result) |
| template<uint32 N> | |
| __device__ static __forceinline__ void | static_sub (bool p, uint32 *dest, uint32 *result) |
|
inline |
stateful object constructor
|
inline |
add a per-thread value to the shared integer without waiting for the result
| n | number of elements to alloc |
|
inline |
add zero or exactly N per thread to a shared value without waiting for the result: useful to alloc N entries from a common pool. NOTE: this class internally uses a synchronous warp reduction, and as such it requires all threads of the warp to participate in the operation.
| p | allocation predicate |
|
inline |
add a per-thread value to the shared integer: useful to alloc entries from a common pool. NOTE: this class internally uses a synchronous warp scan, and as such it requires all threads of the warp to participate in the operation.
| n | number of elements to alloc |
| result | output result |
|
inline |
add zero or exactly N per thread to a shared value: useful to alloc N entries from a common pool
| p | allocation predicate |
| result | output result |
|
inlinestatic |
add a per-thread value to the shared integer without waiting for the result. NOTE: this class internally uses a synchronous warp reduction, and as such it requires all threads of the warp to participate in the operation.
| n | number of elements to alloc |
| dest | the destination of the atomic |
|
inlinestatic |
add a per-thread value to the shared integer: useful to alloc entries from a common pool. NOTE: this class internally uses a synchronous warp scan, and as such it requires all threads of the warp to participate in the operation.
| n | number of elements to alloc |
| dest | the destination of the atomic |
| result | output result |
|
inlinestatic |
add zero or exactly N per thread to a shared value without waiting for the result: useful to alloc N entries from a common pool
| p | allocation predicate |
| dest | the destination of the atomic |
|
inlinestatic |
add zero or exactly N per thread to a shared value: useful to alloc N entries from a common pool
| p | allocation predicate |
| dest | the destination of the atomic |
| result | output result |
|
inlinestatic |
subtract zero or exactly N per thread from a shared value without waiting for the result: useful to dealloc N entries from a common pool
| p | allocation predicate |
| dest | the destination of the atomic |
|
inlinestatic |
subtract zero or exactly N per thread from a shared value: useful to dealloc N entries from a common pool
| p | allocation predicate |
| dest | the destination of the atomic |
| result | output result |
|
inline |
subtract a per-thread value from the shared integer without waiting for the result
| n | number of elements to dealloc |
|
inline |
subtract zero or exactly N per thread from a shared value without waiting for the result: useful to dealloc N entries from a common pool. NOTE: this class internally uses a synchronous warp reduction, and as such it requires all threads of the warp to participate in the operation.
| p | allocation predicate |
|
inline |
subtract a per-thread value from the shared integer: useful to dealloc entries from a common pool. NOTE: this class internally uses a synchronous warp scan, and as such it requires all threads of the warp to participate in the operation.
| n | number of elements to dealloc |
| result | output result |
|
inline |
subtract zero or exactly N per thread from a shared value: useful to dealloc N entries from a common pool
| p | allocation predicate |
| result | output result |
|
inlinestatic |
subtract a per-thread value from the shared integer without waiting for the result. NOTE: this class internally uses a synchronous warp reduction, and as such it requires all threads of the warp to participate in the operation.
| n | number of elements to dealloc |
| dest | the destination of the atomic |
|
inlinestatic |
subtract a per-thread value from the shared integer: useful to dealloc entries from a common pool. NOTE: this class internally uses a synchronous warp scan, and as such it requires all threads of the warp to participate in the operation.
| n | number of elements to dealloc |
| dest | the destination of the atomic |
| result | output result |
1.8.13