Detailed description

An efficient warp-synchronous atomic adder, used to add to or subtract from a shared integer.

Given a pointer to an integer (e.g. representing a "pool"), this class allows the threads in a warp to add (allocate) or subtract (deallocate) a per-thread integer to it.

#include <warp_atomics.h>

Classes
struct	temp_storage_type

Public Methods
__device__ __forceinline__	warp_atomic (uint32 *dest, temp_storage_type &temp_storage)

__device__ __forceinline__ void	add (uint32 n)

__device__ __forceinline__ void	sub (uint32 n)

template<uint32 N>
__device__ __forceinline__ void	add (bool p)

template<uint32 N>
__device__ __forceinline__ void	sub (bool p)

__device__ __forceinline__ void	add (uint32 n, uint32 *result)

__device__ __forceinline__ void	sub (uint32 n, uint32 *result)

template<uint32 N>
__device__ __forceinline__ void	add (bool p, uint32 *result)

template<uint32 N>
__device__ __forceinline__ void	sub (bool p, uint32 *result)

Static Public Methods
__device__ static __forceinline__ void	add (uint32 n, uint32 *dest, temp_storage_type &temp_storage)

__device__ static __forceinline__ void	sub (uint32 n, uint32 *dest, temp_storage_type &temp_storage)

__device__ static __forceinline__ void	add (uint32 n, uint32 *dest, uint32 *result, temp_storage_type &temp_storage)

__device__ static __forceinline__ void	sub (uint32 n, uint32 *dest, uint32 *result, temp_storage_type &temp_storage)

template<uint32 N>
__device__ static __forceinline__ void	static_add (bool p, uint32 *dest)

template<uint32 N>
__device__ static __forceinline__ void	static_sub (bool p, uint32 *dest)

template<uint32 N>
__device__ static __forceinline__ void	static_add (bool p, uint32 *dest, uint32 *result)

template<uint32 N>
__device__ static __forceinline__ void	static_sub (bool p, uint32 *dest, uint32 *result)

Constructor & Destructor Documentation

◆ warp_atomic()

__device__ __forceinline__ cugar::cuda::warp_atomic::warp_atomic	(	uint32 *	dest,
		temp_storage_type &	temp_storage
	)

inline

stateful object constructor

Member Function Documentation

◆ add() [1/6]

__device__ __forceinline__ void cugar::cuda::warp_atomic::add ( uint32 n )

inline

add a per-thread value to the shared integer without waiting for the result

Parameters

n	number of elements to alloc

◆ add() [2/6]

template<uint32 N>

__device__ __forceinline__ void cugar::cuda::warp_atomic::add ( bool p )

inline

add zero or exactly N per thread to a shared value without waiting for the result: useful to alloc N entries from a common pool. NOTE: this class internally uses a synchronous warp reduction, and as such it requires all threads to participate in the operation.

Parameters

p	allocation predicate

◆ add() [3/6]

__device__ __forceinline__ void cugar::cuda::warp_atomic::add	(	uint32	n,
		uint32 *	result
	)

inline

add a per-thread value to the shared integer: useful to alloc entries from a common pool. NOTE: this class internally uses a synchronous warp scan, and as such it requires all threads to participate in the operation.

Parameters

n	number of elements to alloc
result	output result

◆ add() [4/6]

template<uint32 N>

__device__ __forceinline__ void cugar::cuda::warp_atomic::add	(	bool	p,
		uint32 *	result
	)

inline

add zero or exactly N per thread to a shared value: useful to alloc N entries from a common pool

Parameters

p	allocation predicate
result	output result

◆ add() [5/6]

__device__ static __forceinline__ void cugar::cuda::warp_atomic::add	(	uint32	n,
		uint32 *	dest,
		temp_storage_type &	temp_storage
	)

inline static

add a per-thread value to the shared integer without waiting for the result. NOTE: this class internally uses a synchronous warp reduction, and as such it requires all threads to participate in the operation.

Parameters

n	number of elements to alloc
dest	the destination of the atomic

◆ add() [6/6]

__device__ static __forceinline__ void cugar::cuda::warp_atomic::add	(	uint32	n,
		uint32 *	dest,
		uint32 *	result,
		temp_storage_type &	temp_storage
	)

inline static

add a per-thread value to the shared integer: useful to alloc entries from a common pool. NOTE: this class internally uses a synchronous warp scan, and as such it requires all threads to participate in the operation.

Parameters

n	number of elements to alloc
dest	the destination of the atomic
result	output result

◆ static_add() [1/2]

template<uint32 N>

__device__ static __forceinline__ void cugar::cuda::warp_atomic::static_add	(	bool	p,
		uint32 *	dest
	)

inline static

add zero or exactly N per thread to a shared value without waiting for the result: useful to alloc N entries from a common pool

Parameters

p	allocation predicate
dest	the destination of the atomic

◆ static_add() [2/2]

template<uint32 N>

__device__ static __forceinline__ void cugar::cuda::warp_atomic::static_add	(	bool	p,
		uint32 *	dest,
		uint32 *	result
	)

inline static

add zero or exactly N per thread to a shared value: useful to alloc N entries from a common pool

Parameters

p	allocation predicate
dest	the destination of the atomic
result	output result

◆ static_sub() [1/2]

template<uint32 N>

__device__ static __forceinline__ void cugar::cuda::warp_atomic::static_sub	(	bool	p,
		uint32 *	dest
	)

inline static

subtract zero or exactly N per thread from a shared value without waiting for the result: useful to dealloc N entries from a common pool

Parameters

p	allocation predicate
dest	the destination of the atomic

◆ static_sub() [2/2]

template<uint32 N>

__device__ static __forceinline__ void cugar::cuda::warp_atomic::static_sub	(	bool	p,
		uint32 *	dest,
		uint32 *	result
	)

inline static

subtract zero or exactly N per thread from a shared value: useful to dealloc N entries from a common pool

Parameters

p	allocation predicate
dest	the destination of the atomic
result	output result

◆ sub() [1/6]

__device__ __forceinline__ void cugar::cuda::warp_atomic::sub ( uint32 n )

inline

subtract a per-thread value from the shared integer without waiting for the result

Parameters

n	number of elements to dealloc

◆ sub() [2/6]

template<uint32 N>

__device__ __forceinline__ void cugar::cuda::warp_atomic::sub ( bool p )

inline

subtract zero or exactly N per thread from a shared value without waiting for the result: useful to dealloc N entries from a common pool. NOTE: this class internally uses a synchronous warp reduction, and as such it requires all threads to participate in the operation.

Parameters

p	allocation predicate

◆ sub() [3/6]

__device__ __forceinline__ void cugar::cuda::warp_atomic::sub	(	uint32	n,
		uint32 *	result
	)

inline

subtract a per-thread value from the shared integer: useful to dealloc entries from a common pool. NOTE: this class internally uses a synchronous warp scan, and as such it requires all threads to participate in the operation.

Parameters

n	number of elements to dealloc
result	output result

◆ sub() [4/6]

template<uint32 N>

__device__ __forceinline__ void cugar::cuda::warp_atomic::sub	(	bool	p,
		uint32 *	result
	)

inline

subtract zero or exactly N per thread from a shared value: useful to dealloc N entries from a common pool

Parameters

p	allocation predicate
result	output result

◆ sub() [5/6]

__device__ static __forceinline__ void cugar::cuda::warp_atomic::sub	(	uint32	n,
		uint32 *	dest,
		temp_storage_type &	temp_storage
	)

inline static

subtract a per-thread value from the shared integer without waiting for the result. NOTE: this class internally uses a synchronous warp reduction, and as such it requires all threads to participate in the operation.

Parameters

n	number of elements to dealloc
dest	the destination of the atomic

◆ sub() [6/6]

__device__ static __forceinline__ void cugar::cuda::warp_atomic::sub	(	uint32	n,
		uint32 *	dest,
		uint32 *	result,
		temp_storage_type &	temp_storage
	)

inline static

subtract a per-thread value from the shared integer: useful to dealloc entries from a common pool. NOTE: this class internally uses a synchronous warp scan, and as such it requires all threads to participate in the operation.

Parameters

n	number of elements to dealloc
dest	the destination of the atomic
result	output result

The documentation for this struct was generated from the following file:

C:/p4research/research/jpantaleoni/Fermat-Public/contrib/cugar/basic/cuda/warp_atomics.h

Detailed description

Classes

Public Methods

Static Public Methods

Constructor & Destructor Documentation

◆ warp_atomic()

Member Function Documentation

◆ add() [1/6]

◆ add() [2/6]

◆ add() [3/6]

◆ add() [4/6]

◆ add() [5/6]

◆ add() [6/6]

◆ static_add() [1/2]

◆ static_add() [2/2]

◆ static_sub() [1/2]

◆ static_sub() [2/2]

◆ sub() [1/6]

◆ sub() [2/6]

◆ sub() [3/6]

◆ sub() [4/6]

◆ sub() [5/6]

◆ sub() [6/6]