Fermat
Classes | Public Methods | Static Public Methods | List of all members
cugar::cuda::warp_atomic Struct Reference

Detailed description

An efficient warp-synchronous atomic adder, used to add to or subtract from a shared integer.

Given a pointer to an integer (e.g. representing a "pool"), this class allows the threads in a warp to add a per-thread integer to it (allocate) or subtract a per-thread integer from it (deallocate).

#include <warp_atomics.h>

Classes

struct  temp_storage_type
 

Public Methods

__device__ __forceinline__ warp_atomic (uint32 *dest, temp_storage_type &temp_storage)
 
__device__ __forceinline__ void add (uint32 n)
 
__device__ __forceinline__ void sub (uint32 n)
 
template<uint32 N>
__device__ __forceinline__ void add (bool p)
 
template<uint32 N>
__device__ __forceinline__ void sub (bool p)
 
__device__ __forceinline__ void add (uint32 n, uint32 *result)
 
__device__ __forceinline__ void sub (uint32 n, uint32 *result)
 
template<uint32 N>
__device__ __forceinline__ void add (bool p, uint32 *result)
 
template<uint32 N>
__device__ __forceinline__ void sub (bool p, uint32 *result)
 

Static Public Methods

__device__ static __forceinline__ void add (uint32 n, uint32 *dest, temp_storage_type &temp_storage)
 
__device__ static __forceinline__ void sub (uint32 n, uint32 *dest, temp_storage_type &temp_storage)
 
__device__ static __forceinline__ void add (uint32 n, uint32 *dest, uint32 *result, temp_storage_type &temp_storage)
 
__device__ static __forceinline__ void sub (uint32 n, uint32 *dest, uint32 *result, temp_storage_type &temp_storage)
 
template<uint32 N>
__device__ static __forceinline__ void static_add (bool p, uint32 *dest)
 
template<uint32 N>
__device__ static __forceinline__ void static_sub (bool p, uint32 *dest)
 
template<uint32 N>
__device__ static __forceinline__ void static_add (bool p, uint32 *dest, uint32 *result)
 
template<uint32 N>
__device__ static __forceinline__ void static_sub (bool p, uint32 *dest, uint32 *result)
 

Constructor & Destructor Documentation

◆ warp_atomic()

__device__ __forceinline__ cugar::cuda::warp_atomic::warp_atomic ( uint32 *  dest,
temp_storage_type &  temp_storage 
)
inline

stateful object constructor

Member Function Documentation

◆ add() [1/6]

__device__ __forceinline__ void cugar::cuda::warp_atomic::add ( uint32  n)
inline

add a per-thread value to the shared integer without waiting for the result

Parameters
n: number of elements to alloc

◆ add() [2/6]

template<uint32 N>
__device__ __forceinline__ void cugar::cuda::warp_atomic::add ( bool  p)
inline

add zero or exactly N per thread to a shared value without waiting for the result: useful to alloc N entries from a common pool. NOTE: this class internally uses a synchronous warp reduction, and as such it requires all threads to participate in the operation.

Parameters
p: allocation predicate

◆ add() [3/6]

__device__ __forceinline__ void cugar::cuda::warp_atomic::add ( uint32  n,
uint32 *  result 
)
inline

add a per-thread value to the shared integer: useful to alloc entries from a common pool. NOTE: this class internally uses a synchronous warp scan, and as such it requires all threads to participate in the operation.

Parameters
n: number of elements to alloc
result: output result

◆ add() [4/6]

template<uint32 N>
__device__ __forceinline__ void cugar::cuda::warp_atomic::add ( bool  p,
uint32 *  result 
)
inline

add zero or exactly N per thread to a shared value: useful to alloc N entries from a common pool

Parameters
p: allocation predicate
result: output result

◆ add() [5/6]

__device__ static __forceinline__ void cugar::cuda::warp_atomic::add ( uint32  n,
uint32 *  dest,
temp_storage_type &  temp_storage 
)
inlinestatic

add a per-thread value to the shared integer without waiting for the result. NOTE: this class internally uses a synchronous warp reduction, and as such it requires all threads to participate in the operation.

Parameters
n: number of elements to alloc
dest: the destination of the atomic

◆ add() [6/6]

__device__ static __forceinline__ void cugar::cuda::warp_atomic::add ( uint32  n,
uint32 *  dest,
uint32 *  result,
temp_storage_type &  temp_storage 
)
inlinestatic

add a per-thread value to the shared integer: useful to alloc entries from a common pool. NOTE: this class internally uses a synchronous warp scan, and as such it requires all threads to participate in the operation.

Parameters
n: number of elements to alloc
dest: the destination of the atomic
result: output result

◆ static_add() [1/2]

template<uint32 N>
__device__ static __forceinline__ void cugar::cuda::warp_atomic::static_add ( bool  p,
uint32 *  dest 
)
inlinestatic

add zero or exactly N per thread to a shared value without waiting for the result: useful to alloc N entries from a common pool

Parameters
p: allocation predicate
dest: the destination of the atomic

◆ static_add() [2/2]

template<uint32 N>
__device__ static __forceinline__ void cugar::cuda::warp_atomic::static_add ( bool  p,
uint32 *  dest,
uint32 *  result 
)
inlinestatic

add zero or exactly N per thread to a shared value: useful to alloc N entries from a common pool

Parameters
p: allocation predicate
dest: the destination of the atomic
result: output result

◆ static_sub() [1/2]

template<uint32 N>
__device__ static __forceinline__ void cugar::cuda::warp_atomic::static_sub ( bool  p,
uint32 *  dest 
)
inlinestatic

subtract zero or exactly N per thread from a shared value without waiting for the result: useful to dealloc N entries from a common pool

Parameters
p: deallocation predicate
dest: the destination of the atomic

◆ static_sub() [2/2]

template<uint32 N>
__device__ static __forceinline__ void cugar::cuda::warp_atomic::static_sub ( bool  p,
uint32 *  dest,
uint32 *  result 
)
inlinestatic

subtract zero or exactly N per thread from a shared value: useful to dealloc N entries from a common pool

Parameters
p: deallocation predicate
dest: the destination of the atomic
result: output result

◆ sub() [1/6]

__device__ __forceinline__ void cugar::cuda::warp_atomic::sub ( uint32  n)
inline

subtract a per-thread value from the shared integer without waiting for the result

Parameters
n: number of elements to dealloc

◆ sub() [2/6]

template<uint32 N>
__device__ __forceinline__ void cugar::cuda::warp_atomic::sub ( bool  p)
inline

subtract zero or exactly N per thread from a shared value without waiting for the result: useful to dealloc N entries from a common pool. NOTE: this class internally uses a synchronous warp reduction, and as such it requires all threads to participate in the operation.

Parameters
p: deallocation predicate

◆ sub() [3/6]

__device__ __forceinline__ void cugar::cuda::warp_atomic::sub ( uint32  n,
uint32 *  result 
)
inline

subtract a per-thread value from the shared integer: useful to dealloc entries from a common pool. NOTE: this class internally uses a synchronous warp scan, and as such it requires all threads to participate in the operation.

Parameters
n: number of elements to dealloc
result: output result

◆ sub() [4/6]

template<uint32 N>
__device__ __forceinline__ void cugar::cuda::warp_atomic::sub ( bool  p,
uint32 *  result 
)
inline

subtract zero or exactly N per thread from a shared value: useful to dealloc N entries from a common pool

Parameters
p: deallocation predicate
result: output result

◆ sub() [5/6]

__device__ static __forceinline__ void cugar::cuda::warp_atomic::sub ( uint32  n,
uint32 *  dest,
temp_storage_type &  temp_storage 
)
inlinestatic

subtract a per-thread value from the shared integer without waiting for the result. NOTE: this class internally uses a synchronous warp reduction, and as such it requires all threads to participate in the operation.

Parameters
n: number of elements to dealloc
dest: the destination of the atomic

◆ sub() [6/6]

__device__ static __forceinline__ void cugar::cuda::warp_atomic::sub ( uint32  n,
uint32 *  dest,
uint32 *  result,
temp_storage_type &  temp_storage 
)
inlinestatic

subtract a per-thread value from the shared integer: useful to dealloc entries from a common pool. NOTE: this class internally uses a synchronous warp scan, and as such it requires all threads to participate in the operation.

Parameters
n: number of elements to dealloc
dest: the destination of the atomic
result: output result

The documentation for this struct was generated from the following file: