32 #include <cuda_runtime.h>
40 #if DO_OPTIONAL_SYNCHRONIZE
41 cudaDeviceSynchronize();
60 #if USE_WARP_SYNCHRONOUS_QUEUES
61 const uint32 mask = __ballot(
true );
62 const uint32 pop_scan = __popc( mask << (32u - warp_tid()) );
63 const uint32 pop_count = __popc( mask );
65 *warp_broadcast = atomicAdd( counter, pop_count );
67 return *warp_broadcast + pop_scan;
69 return atomicAdd( counter, 1u );