43 template <
typename T> __device__ __forceinline__ T
bit_scan(
bool p)
45 const uint32 mask = __ballot( p );
46 const uint32 pop_scan = __popc( mask << (warpSize - warp_tid() - 1u) );
55 template <
typename T> __device__ __forceinline__ T
scan_warp(T val,
const int32 tidx,
volatile T *red)
63 val += red[tidx-1]; red[tidx] = val;
64 val += red[tidx-2]; red[tidx] = val;
65 val += red[tidx-4]; red[tidx] = val;
66 val += red[tidx-8]; red[tidx] = val;
67 val += red[tidx-16]; red[tidx] = val;
73 template <
typename T> __device__ __forceinline__ T
scan_warp_total(
volatile T *red) {
return red[63]; }
80 template <
typename T,
typename Op, u
int32 COUNT>
88 template <
typename T,
typename Op>
91 static __device__ __forceinline__ T
scan(T val,
const Op op,
const int32 tidx,
volatile T *red,
const T init)
99 val = op( val, red[tidx-1] ); red[tidx] = val;
100 val = op( val, red[tidx-2] ); red[tidx] = val;
101 val = op( val, red[tidx-4] ); red[tidx] = val;
102 val = op( val, red[tidx-8] ); red[tidx] = val;
103 val = op( val, red[tidx-16] ); red[tidx] = val;
106 static __device__ __forceinline__ T
scan_total(
volatile T *red) {
return red[63]; }
113 template <
typename T,
typename Op>
116 static __device__ __forceinline__ T
scan(T val,
const Op op,
const int32 tidx,
volatile T *red,
const T init)
126 val = op( val, red[tidx-1] ); red[tidx] = val;
127 val = op( val, red[tidx-2] ); red[tidx] = val;
128 val = op( val, red[tidx-4] ); red[tidx] = val;
129 val = op( val, red[tidx-8] ); red[tidx] = val;
133 static __device__ __forceinline__ T
scan_total(
volatile T *red) {
return red[47]; }
140 template <
typename T,
typename Op>
143 static __device__ __forceinline__ T
scan(T val,
const Op op,
const int32 tidx,
volatile T *red,
const T init)
153 val = op( val, red[tidx-1] ); red[tidx] = val;
154 val = op( val, red[tidx-2] ); red[tidx] = val;
155 val = op( val, red[tidx-4] ); red[tidx] = val;
159 static __device__ __forceinline__ T
scan_total(
volatile T *red) {
return red[39]; }
166 template <
typename T,
typename Op>
169 static __device__ __forceinline__ T
scan(T val,
const Op op,
const int32 tidx,
volatile T *red,
const T init)
179 val = op( val, red[tidx-1] ); red[tidx] = val;
180 val = op( val, red[tidx-2] ); red[tidx] = val;
184 static __device__ __forceinline__ T
scan_total(
volatile T *red) {
return red[35]; }
191 template <
typename T,
typename Op>
194 static __device__ __forceinline__ T
scan(T val,
const Op op,
const int32 tidx,
volatile T *red,
const T init)
204 val = op( val, red[tidx-1] ); red[tidx] = val;
208 static __device__ __forceinline__ T
scan_total(
volatile T *red) {
return red[33]; }
216 template <u
int32 COUNT,
typename T,
typename Op> __device__ __forceinline__ T
scan(T val,
const Op op,
const T init,
volatile T *red)
225 template <u
int32 COUNT,
typename T> __device__ __forceinline__ T
scan(T val,
volatile T *red)
232 template <u
int32 COUNT,
typename T> __device__ __forceinline__ T
scan_total(
volatile T *red)
244 __device__ __forceinline__
250 *warp_broadcast = atomicAdd( pool, warp_count );
252 return *warp_broadcast + warp_scan;
261 __device__ __forceinline__
264 const uint32 warp_mask = __ballot( pred );
265 const uint32 warp_count = __popc( warp_mask );
266 const uint32 warp_scan = __popc( warp_mask << (warpSize - warp_tid) );
269 if (warp_scan == 0 && pred)
270 *warp_broadcast = atomicAdd( pool, warp_count * N );
273 return *warp_broadcast + warp_scan * N;
277 template <u
int32 COUNT>
284 static __device__ __forceinline__
bool enact(
const bool p,
volatile uint8* sm)
294 static __device__ __forceinline__
bool enact(
const bool p,
volatile uint8* sm)
296 const uint32 mask = __ballot(p);
297 const uint32 tid = (threadIdx.x & 31) >> 1;
298 const uint32 tmask = 3u << (tid*2);
299 return (mask & tmask) == tmask;
307 static __device__ __forceinline__
bool enact(
const bool p,
volatile uint8* sm)
309 const uint32 mask = __ballot(p);
310 const uint32 tid = (threadIdx.x & 31) >> 2;
311 const uint32 tmask = 15u << (tid*4);
312 return (mask & tmask) == tmask;
320 static __device__ __forceinline__
bool enact(
const bool p,
volatile uint8* sm)
322 const uint32 mask = __ballot(p);
323 const uint32 tid = (threadIdx.x & 31) >> 3;
324 const uint32 tmask = 255u << (tid*8);
325 return (mask & tmask) == tmask;
333 static __device__ __forceinline__
bool enact(
const bool p,
volatile uint8* sm)
335 const uint32 mask = __ballot(p);
336 const uint32 tid = (threadIdx.x & 31) >> 4;
337 const uint32 tmask = 65535u << (tid*16);
338 return (mask & tmask) == tmask;
346 static __device__ __forceinline__
bool enact(
const bool p,
volatile uint8* sm)
348 sm[ warp_id() ] = __all(p);
352 const uint32 tid = warp_id() >> 1;
353 return sm[ tid*2 ] & sm[ tid*2 + 1 ];
360 static __device__ __forceinline__
bool enact(
const bool p,
volatile uint8* sm)
362 sm[ warp_id() ] = __all(p);
366 const uint32 bid = warp_id() >> 2;
367 return __all( sm[ bid * 4 + warp_tid() & 3 ] );
374 static __device__ __forceinline__
bool enact(
const bool p,
volatile uint8* sm)
376 sm[ warp_id() ] = __all(p);
380 const uint32 bid = warp_id() >> 3;
381 return __all( sm[ bid * 8 + warp_tid() & 7 ] );
393 template <u
int32 COUNT>
394 __device__ __forceinline__
bool all(
const bool p,
volatile uint8* sm = NULL)
400 template <u
int32 COUNT>
407 static __device__ __forceinline__
bool enact(
const bool p,
volatile uint8* sm)
417 static __device__ __forceinline__
bool enact(
const bool p,
volatile uint8* sm)
419 const uint32 mask = __ballot(p);
420 const uint32 tid = (threadIdx.x & 31) >> 1;
421 const uint32 tmask = 3u << (tid*2);
422 return (mask & tmask);
430 static __device__ __forceinline__
bool enact(
const bool p,
volatile uint8* sm)
432 const uint32 mask = __ballot(p);
433 const uint32 tid = (threadIdx.x & 31) >> 2;
434 const uint32 tmask = 15u << (tid*4);
435 return (mask & tmask);
443 static __device__ __forceinline__
bool enact(
const bool p,
volatile uint8* sm)
445 const uint32 mask = __ballot(p);
446 const uint32 tid = (threadIdx.x & 31) >> 3;
447 const uint32 tmask = 255u << (tid*8);
448 return (mask & tmask);
456 static __device__ __forceinline__
bool enact(
const bool p,
volatile uint8* sm)
458 const uint32 mask = __ballot(p);
459 const uint32 tid = (threadIdx.x & 31) >> 4;
460 const uint32 tmask = 65535u << (tid*16);
461 return (mask & tmask);
469 static __device__ __forceinline__
bool enact(
const bool p,
volatile uint8* sm)
471 sm[ warp_id() ] = __any(p);
475 const uint32 tid = warp_id() >> 1;
476 return sm[ tid*2 ] | sm[ tid*2 + 1 ];
483 static __device__ __forceinline__
bool enact(
const bool p,
volatile uint8* sm)
485 sm[ warp_id() ] = __any(p);
489 const uint32 bid = warp_id() >> 2;
490 return __any( sm[ bid * 4 + warp_tid() & 3 ] );
497 static __device__ __forceinline__
bool enact(
const bool p,
volatile uint8* sm)
499 sm[ warp_id() ] = __any(p);
503 const uint32 bid = warp_id() >> 3;
504 return __any( sm[ bid * 8 + warp_tid() & 7 ] );
516 template <u
int32 COUNT>
517 __device__ __forceinline__
bool any(
const bool p,
volatile uint8* sm = NULL)