36 #include <cugar/basic/types.h> 37 #include <cugar/bits/popcount.h> 38 #include <cub/cub.cuh> 55 __device__ __forceinline__
58 #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 59 const unsigned int lane_id = threadIdx.x & 31;
62 int mask = __match_all_sync(__activemask(), (
unsigned long long)ptr, &pred);
63 int leader = __ffs(mask) - 1;
66 if (lane_id == leader)
67 res = atomicAdd(ptr, __popc(mask));
69 res = __shfl_sync(mask, res, leader);
70 return res + __popc(mask & ((1 << lane_id) - 1));
72 const uint32 warp_tid = threadIdx.x & 31;
73 const uint32 warp_mask = __ballot_sync( __activemask(),
true );
74 const uint32 warp_count = __popc( warp_mask );
75 const uint32 warp_scan = __popc( warp_mask << (warpSize - warp_tid) );
77 const uint32 first_tid =
ffs(warp_mask) - 1;
79 uint32 broadcast_offset = 0;
82 if (warp_tid == first_tid)
83 broadcast_offset = atomicAdd( ptr, warp_count );
86 const uint32 offset = cub::ShuffleIndex<32>(broadcast_offset, first_tid, __activemask());
89 return offset + warp_scan;
103 __device__ __forceinline__
111 __device__ __forceinline__
114 warp_static_atomic::static_add<N>( p, m_dest );
121 __device__ __forceinline__
124 warp_static_atomic::static_sub<N>( p, m_dest );
131 __device__ __forceinline__
132 void add(
bool p, uint32* result)
134 warp_static_atomic::static_add<N>( p, m_dest, result );
141 __device__ __forceinline__
142 void sub(
bool p, uint32* result)
144 warp_static_atomic::static_sub<N>( p, m_dest, result );
154 __device__ __forceinline__
157 const uint32 warp_tid = threadIdx.x & 31;
158 const uint32 warp_mask = __ballot_sync( __activemask(), p );
159 const uint32 warp_count = __popc( warp_mask );
160 const uint32 warp_scan = __popc( warp_mask << (warpSize - warp_tid) );
163 if (warp_scan == 0 && p)
164 atomicAdd( dest, warp_count * N );
172 __device__ __forceinline__
175 const uint32 warp_tid = threadIdx.x & 31;
176 const uint32 warp_mask = __ballot_sync( __activemask(), p );
177 const uint32 warp_count = __popc( warp_mask );
178 const uint32 warp_scan = __popc( warp_mask << (warpSize - warp_tid) );
181 if (warp_scan == 0 && p)
182 atomicSub( dest, warp_count * N );
191 __device__ __forceinline__
194 const uint32 warp_tid = threadIdx.x & 31;
195 const uint32 warp_mask = __ballot_sync( __activemask(), p );
196 const uint32 warp_count = __popc( warp_mask );
197 const uint32 warp_scan = __popc( warp_mask << (warpSize - warp_tid) );
199 const uint32 first_tid =
ffs(warp_mask) - 1;
201 uint32 broadcast_offset;
204 if (warp_tid == first_tid)
205 broadcast_offset = atomicAdd( dest, warp_count * N );
208 const uint32 offset = cub::ShuffleIndex<32>(broadcast_offset, first_tid, __activemask());
211 *result = offset + warp_scan * N;
220 __device__ __forceinline__
223 const uint32 warp_tid = threadIdx.x & 31;
224 const uint32 warp_mask = __ballot_sync( __activemask(), p );
225 const uint32 warp_count = __popc( warp_mask );
226 const uint32 warp_scan = __popc( warp_mask << (warpSize - warp_tid) );
228 const uint32 first_tid =
ffs(warp_mask) - 1;
230 uint32 broadcast_offset;
233 if (warp_tid == first_tid)
234 broadcast_offset = atomicSub( dest, warp_count * N );
237 const uint32 offset = cub::ShuffleIndex<32>(broadcast_offset, first_tid, __activemask());
240 *result = offset - warp_scan * N;
257 typename cub::WarpScan<uint32>::TempStorage scan_storage;
258 typename cub::WarpReduce<uint32>::TempStorage reduce_storage;
264 __device__ __forceinline__
266 : m_dest(dest), m_temp_storage(temp_storage) {}
272 __device__ __forceinline__
281 __device__ __forceinline__
293 __device__ __forceinline__
296 return warp_atomic::static_add<N>( p, m_dest );
305 __device__ __forceinline__
308 warp_atomic::static_sub<N>( p, m_dest );
317 __device__ __forceinline__
318 void add(uint32 n, uint32* result)
329 __device__ __forceinline__
330 void sub(uint32 n, uint32* result)
340 __device__ __forceinline__
341 void add(
bool p, uint32* result)
343 return warp_atomic::static_add<N>( p, m_dest, result );
351 __device__ __forceinline__
352 void sub(
bool p, uint32* result)
354 return warp_atomic::static_sub<N>( p, m_dest, result );
365 __device__ __forceinline__
369 const uint32 warp_count = cub::WarpReduce<uint32>(temp_storage.reduce_storage).Sum(n);
372 const uint32 warp_tid = threadIdx.x & 31;
374 atomicAdd( dest, warp_count );
383 __device__ __forceinline__
387 const uint32 warp_count = cub::WarpReduce<uint32>(temp_storage.reduce_storage).Sum(n);
390 const uint32 warp_tid = threadIdx.x & 31;
392 atomicSub( dest, warp_count );
402 __device__ __forceinline__
405 uint32 warp_scan, warp_count;
408 cub::WarpScan<uint32>(temp_storage.scan_storage).ExclusiveSum(n, warp_scan, warp_count);
410 const uint32 warp_tid = threadIdx.x & 31;
415 base_index = atomicAdd( dest, warp_count );
418 *result = cub::ShuffleIndex<32>( base_index, 0, __activemask() ) + warp_scan;
428 __device__ __forceinline__
431 uint32 warp_scan, warp_count;
434 cub::WarpScan<uint32>(temp_storage.scan_storage).ExclusiveSum(n, warp_scan, warp_count);
436 const uint32 warp_tid = threadIdx.x & 31;
441 base_index = atomicSub( dest, warp_count );
444 *result = cub::ShuffleIndex<32>( base_index, 0, __activemask() ) - warp_scan;
452 __device__ __forceinline__
455 return warp_static_atomic::static_add<N>( p, dest );
463 __device__ __forceinline__
466 warp_static_atomic::static_sub<N>( p, dest );
475 __device__ __forceinline__
478 return warp_static_atomic::static_add<N>( p, dest, result );
487 __device__ __forceinline__
490 warp_static_atomic::static_sub<N>( p, dest, result );
__device__ static __forceinline__ void static_sub(bool p, uint32 *dest, uint32 *result)
Definition: warp_atomics.h:221
__device__ __forceinline__ unsigned int warp_increment(unsigned int *ptr)
Definition: warp_atomics.h:56
__device__ __forceinline__ void sub(bool p, uint32 *result)
Definition: warp_atomics.h:142
__device__ static __forceinline__ void add(uint32 n, uint32 *dest, temp_storage_type &temp_storage)
Definition: warp_atomics.h:366
__device__ __forceinline__ void sub(uint32 n)
Definition: warp_atomics.h:282
__device__ __forceinline__ void sub(uint32 n, uint32 *result)
Definition: warp_atomics.h:330
__device__ __forceinline__ void sub(bool p, uint32 *result)
Definition: warp_atomics.h:352
__device__ static __forceinline__ void static_add(bool p, uint32 *dest, uint32 *result)
Definition: warp_atomics.h:192
__device__ __forceinline__ void add(bool p, uint32 *result)
Definition: warp_atomics.h:132
__device__ __forceinline__ void sub(bool p)
Definition: warp_atomics.h:122
__device__ __forceinline__ warp_static_atomic(uint32 *pool)
Definition: warp_atomics.h:104
__device__ __forceinline__ void add(bool p)
Definition: warp_atomics.h:294
__device__ static __forceinline__ void static_sub(bool p, uint32 *dest, uint32 *result)
Definition: warp_atomics.h:488
__device__ __forceinline__ void add(uint32 n)
Definition: warp_atomics.h:273
__device__ static __forceinline__ void add(uint32 n, uint32 *dest, uint32 *result, temp_storage_type &temp_storage)
Definition: warp_atomics.h:403
__device__ static __forceinline__ void static_add(bool p, uint32 *dest)
Definition: warp_atomics.h:155
__device__ __forceinline__ void add(bool p, uint32 *result)
Definition: warp_atomics.h:341
__device__ static __forceinline__ void static_sub(bool p, uint32 *dest)
Definition: warp_atomics.h:173
__device__ static __forceinline__ void static_sub(bool p, uint32 *dest)
Definition: warp_atomics.h:464
__device__ static __forceinline__ void sub(uint32 n, uint32 *dest, uint32 *result, temp_storage_type &temp_storage)
Definition: warp_atomics.h:429
Define a vector_view POD type and plain_view() for std::vector.
Definition: diff.h:38
__device__ static __forceinline__ void static_add(bool p, uint32 *dest)
Definition: warp_atomics.h:453
Definition: warp_atomics.h:252
Definition: warp_atomics.h:99
__device__ __forceinline__ void sub(bool p)
Definition: warp_atomics.h:306
CUGAR_FORCEINLINE CUGAR_HOST_DEVICE uint32 ffs(const int32 x)
Definition: popcount_inl.h:206
__device__ __forceinline__ void add(uint32 n, uint32 *result)
Definition: warp_atomics.h:318
__device__ __forceinline__ void add(bool p)
Definition: warp_atomics.h:112
Definition: warp_atomics.h:254
__device__ static __forceinline__ void sub(uint32 n, uint32 *dest, temp_storage_type &temp_storage)
Definition: warp_atomics.h:384
__device__ __forceinline__ warp_atomic(uint32 *dest, temp_storage_type &temp_storage)
Definition: warp_atomics.h:265
__device__ static __forceinline__ void static_add(bool p, uint32 *dest, uint32 *result)
Definition: warp_atomics.h:476