NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
scan.h
Go to the documentation of this file.
1 /*
2  * nvbio
3  * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
32 #pragma once
33 
34 #include <nvbio/basic/types.h>
35 #include <nvbio/basic/numbers.h>
36 
37 namespace nvbio {
38 namespace cuda {
39 
/// Intra-warp inclusive prefix count of a per-thread predicate.
///
/// Each lane receives the number of lanes with index <= its own whose
/// predicate \p p is true (uses the legacy pre-Volta __ballot / implicit
/// warp synchrony).
///
/// \param p    per-thread predicate
/// \return     inclusive prefix popcount of the warp's ballot mask
template <typename T> __device__ __forceinline__ T bit_scan(bool p)
{
    const uint32 ballot = __ballot( p );
    // drop the bits owned by higher lanes: bits [0, lane] survive the
    // shift, so the popcount is an inclusive scan
    const uint32 shift  = warpSize - warp_tid() - 1u;
    return __popc( ballot << shift );
}
49 
/// Intra-warp inclusive plus-scan (warp-synchronous, pre-Volta style).
///
/// \param val   per-thread input value
/// \param tidx  warp thread index
/// \param red   scratch storage, 2*WARP_SIZE elements; the low 32 entries
///              are zero-filled so the tidx-offset reads below are harmless
/// \return      inclusive prefix sum of \p val across the warp
template <typename T> __device__ __forceinline__ T scan_warp(T val, const int32 tidx, volatile T *red)
{
    // zero the leading pad segment, then scan in the upper half
    red[tidx] = 0;
    red += 32;

    // Hillis-Steele scan with doubling offsets; correctness relies on the
    // volatile qualifier and implicit warp-synchronous execution
    red[tidx] = val;
    #pragma unroll
    for (int32 offset = 1; offset <= 16; offset *= 2)
    {
        val += red[tidx - offset];
        red[tidx] = val;
    }
    return val;
}
/// Return the total of a previous scan_warp() call (the value written by
/// lane 31, i.e. entry 32+31 of the scratch storage).
template <typename T> __device__ __forceinline__ T scan_warp_total(volatile T *red) { return red[32 + 31]; }
74 
// intra-warp inclusive scan dispatcher, specialized below on the number of
// participating threads (COUNT in {2,4,8,16,32})
//
// \param val  per-thread input value
// \param tidx warp thread index
// \param red  scan result storage (2*WARP_SIZE elements)
template <typename T, typename Op, uint32 COUNT>
struct scan_dispatch {};
82 
// intra-warp inclusive scan, full-warp (32 thread) specialization
//
// \param val  per-thread input value
// \param tidx warp thread index
// \param red  scan result storage (2*WARP_SIZE elements)
template <typename T, typename Op>
struct scan_dispatch<T,Op,32>
{
    // warp-synchronous Hillis-Steele scan over all 32 lanes
    static __device__ __forceinline__ T scan(T val, const Op op, const int32 tidx, volatile T *red, const T init)
    {
        // fill the leading pad segment with the identity element so the
        // out-of-range reads below are harmless
        red[tidx] = init;
        red += 32;

        red[tidx] = val;
        #pragma unroll
        for (int32 offset = 1; offset <= 16; offset *= 2)
        {
            val = op( val, red[tidx - offset] );
            red[tidx] = val;
        }
        return val;
    }
    // total of the last scan: the value written by lane 31
    static __device__ __forceinline__ T scan_total(volatile T *red) { return red[32 + 31]; }
};
// intra-warp inclusive scan, 16-thread specialization
//
// \param val  per-thread input value
// \param tidx warp thread index
// \param red  scan result storage (2*WARP_SIZE elements)
template <typename T, typename Op>
struct scan_dispatch<T,Op,16>
{
    static __device__ __forceinline__ T scan(T val, const Op op, const int32 tidx, volatile T *red, const T init)
    {
        // fill the leading pad segment with the identity element
        red[tidx] = init;
        red += 32;

        // Hillis-Steele scan restricted to the first 16 lanes
        if (tidx < 16)
        {
            red[tidx] = val;
            #pragma unroll
            for (int32 offset = 1; offset <= 8; offset *= 2)
            {
                val = op( val, red[tidx - offset] );
                red[tidx] = val;
            }
        }
        return val;
    }
    // total of the last scan: the value written by lane 15
    static __device__ __forceinline__ T scan_total(volatile T *red) { return red[32 + 15]; }
};
// intra-warp inclusive scan, 8-thread specialization
//
// \param val  per-thread input value
// \param tidx warp thread index
// \param red  scan result storage (2*WARP_SIZE elements)
template <typename T, typename Op>
struct scan_dispatch<T,Op,8>
{
    static __device__ __forceinline__ T scan(T val, const Op op, const int32 tidx, volatile T *red, const T init)
    {
        // fill the leading pad segment with the identity element
        red[tidx] = init;
        red += 32;

        // Hillis-Steele scan restricted to the first 8 lanes
        if (tidx < 8)
        {
            red[tidx] = val;
            #pragma unroll
            for (int32 offset = 1; offset <= 4; offset *= 2)
            {
                val = op( val, red[tidx - offset] );
                red[tidx] = val;
            }
        }
        return val;
    }
    // total of the last scan: the value written by lane 7
    static __device__ __forceinline__ T scan_total(volatile T *red) { return red[32 + 7]; }
};
// intra-warp inclusive scan, 4-thread specialization
//
// \param val  per-thread input value
// \param tidx warp thread index
// \param red  scan result storage (2*WARP_SIZE elements)
template <typename T, typename Op>
struct scan_dispatch<T, Op,4>
{
    static __device__ __forceinline__ T scan(T val, const Op op, const int32 tidx, volatile T *red, const T init)
    {
        // fill the leading pad segment with the identity element
        red[tidx] = init;
        red += 32;

        // two Hillis-Steele steps cover a 4-element scan
        if (tidx < 4)
        {
            red[tidx] = val;
            val = op( val, red[tidx - 1] );
            red[tidx] = val;
            val = op( val, red[tidx - 2] );
            red[tidx] = val;
        }
        return val;
    }
    // total of the last scan: the value written by lane 3
    static __device__ __forceinline__ T scan_total(volatile T *red) { return red[32 + 3]; }
};
// intra-warp inclusive scan, 2-thread specialization
//
// \param val  per-thread input value
// \param tidx warp thread index
// \param red  scan result storage (2*WARP_SIZE elements)
template <typename T, typename Op>
struct scan_dispatch<T, Op, 2>
{
    static __device__ __forceinline__ T scan(T val, const Op op, const int32 tidx, volatile T *red, const T init)
    {
        // fill the leading pad segment with the identity element
        red[tidx] = init;
        red += 32;

        // a single Hillis-Steele step covers a 2-element scan
        if (tidx < 2)
        {
            red[tidx] = val;
            val = op( val, red[tidx - 1] );
            red[tidx] = val;
        }
        return val;
    }
    // total of the last scan: the value written by lane 1
    static __device__ __forceinline__ T scan_total(volatile T *red) { return red[32 + 1]; }
};
210 
/// intra-warp inclusive scan over COUNT threads with a generic operator
///
/// \param val   per-thread input value
/// \param op    binary scan operator
/// \param init  identity element used to pad the scratch storage
/// \param red   scan scratch storage (2*WARP_SIZE elements)
template <uint32 COUNT,typename T, typename Op> __device__ __forceinline__ T scan(T val, const Op op, const T init, volatile T *red)
{
    const int32 tidx = warp_tid();
    return scan_dispatch<T,Op,COUNT>::scan( val, op, tidx, red, init );
}
/// intra-warp inclusive plus-scan over COUNT threads
///
/// \param val  per-thread input value
/// \param red  scan scratch storage (2*WARP_SIZE elements)
template <uint32 COUNT,typename T> __device__ __forceinline__ T scan(T val, volatile T *red)
{
    const int32 tidx = warp_tid();
    return scan_dispatch<T,add_functor,COUNT>::scan( val, add_functor(), tidx, red, T(0) );
}
/// Return the total of a previous scan() call.
///
/// \param red  the scratch storage that was passed to scan()
template <uint32 COUNT,typename T> __device__ __forceinline__ T scan_total(volatile T *red)
{
    // FIX: the body had no return statement (flowing off the end of a
    // non-void function is undefined behavior); forward to the dispatcher's
    // scan_total(), mirroring the scan() wrappers above
    return scan_dispatch<T,add_functor,COUNT>::scan_total( red );
}
236 
237 
/// Warp-cooperative allocator: each thread reserves \p n slots from a
/// shared pool using a single atomic per warp.
///
/// \param n               number of slots requested by this thread
/// \param pool            global counter to allocate from
/// \param warp_tid        warp thread index
/// \param warp_red        scratch for the intra-warp scan (2*WARP_SIZE entries)
/// \param warp_broadcast  one slot used to broadcast the warp's base offset
/// \return                this thread's starting offset into the pool
__device__ __forceinline__
uint32 alloc(uint32 n, uint32* pool, const int32 warp_tid, volatile uint32* warp_red, volatile uint32* warp_broadcast)
{
    // exclusive prefix sum of the per-thread request sizes
    const uint32 thread_offset = scan_warp( n, warp_tid, warp_red ) - n;
    const uint32 warp_total    = scan_warp_total( warp_red );

    // lane 0 reserves the whole warp's range with one atomic
    if (warp_tid == 0)
        *warp_broadcast = atomicAdd( pool, warp_total );

    return *warp_broadcast + thread_offset;
}
254 
/// Warp-cooperative allocator: reserve N slots for each thread whose
/// predicate is true, using a single atomic per warp.
///
/// \param pred            per-thread allocation predicate
/// \param pool            global counter to allocate from
/// \param warp_tid        warp thread index
/// \param warp_broadcast  one slot used to broadcast the warp's base offset
/// \return                this thread's offset (meaningful only when \p pred is true)
template <uint32 N>
__device__ __forceinline__
uint32 alloc(bool pred, uint32* pool, const int32 warp_tid, volatile uint32* warp_broadcast)
{
    const uint32 warp_mask  = __ballot( pred );
    const uint32 warp_count = __popc( warp_mask );

    // exclusive prefix popcount: set bits strictly below this lane.
    // FIX: the original __popc( warp_mask << (warpSize - warp_tid) ) shifted
    // a 32-bit value by 32 for lane 0, which is undefined behavior in C++;
    // masking with the lanes-below mask is well-defined and equivalent.
    const uint32 warp_scan  = __popc( warp_mask & ((1u << warp_tid) - 1u) );

    // the first predicated lane acquires a base offset for the whole warp
    if (warp_scan == 0 && pred)
        *warp_broadcast = atomicAdd( pool, warp_count * N );

    // distribute per-thread offsets relative to the warp's base
    return *warp_broadcast + warp_scan * N;
}
275 
// generalized all primitive: true iff the predicate holds for every thread
// in a group of COUNT threads (specialized per group size below)
template <uint32 COUNT>
struct all_dispatch {};
279 
// generalized all primitive, one full warp: maps directly onto the
// hardware vote instruction (sm is unused)
template <>
struct all_dispatch<32>
{
    static __device__ __forceinline__ bool enact(const bool p, volatile uint8* sm)
    {
        return __all( p );
    }
};
289 
// generalized all primitive, groups of 2 threads within a warp (sm unused)
template <>
struct all_dispatch<2>
{
    static __device__ __forceinline__ bool enact(const bool p, volatile uint8* sm)
    {
        // ballot the predicate, then test whether both bits of this
        // thread's 2-lane group are set
        const uint32 ballot = __ballot( p );
        const uint32 group  = (threadIdx.x & 31) >> 1;
        const uint32 gmask  = 3u << (group * 2);
        return (ballot & gmask) == gmask;
    }
};
302 
// generalized all primitive, groups of 4 threads within a warp (sm unused)
template <>
struct all_dispatch<4>
{
    static __device__ __forceinline__ bool enact(const bool p, volatile uint8* sm)
    {
        // ballot the predicate, then test whether all 4 bits of this
        // thread's lane group are set
        const uint32 ballot = __ballot( p );
        const uint32 group  = (threadIdx.x & 31) >> 2;
        const uint32 gmask  = 15u << (group * 4);
        return (ballot & gmask) == gmask;
    }
};
315 
// generalized all primitive, groups of 8 threads within a warp (sm unused)
template <>
struct all_dispatch<8>
{
    static __device__ __forceinline__ bool enact(const bool p, volatile uint8* sm)
    {
        // ballot the predicate, then test whether all 8 bits of this
        // thread's lane group are set
        const uint32 ballot = __ballot( p );
        const uint32 group  = (threadIdx.x & 31) >> 3;
        const uint32 gmask  = 255u << (group * 8);
        return (ballot & gmask) == gmask;
    }
};
328 
// generalized all primitive, groups of 16 threads within a warp (sm unused)
template <>
struct all_dispatch<16>
{
    static __device__ __forceinline__ bool enact(const bool p, volatile uint8* sm)
    {
        // ballot the predicate, then test whether all 16 bits of this
        // thread's half-warp are set
        const uint32 ballot = __ballot( p );
        const uint32 group  = (threadIdx.x & 31) >> 4;
        const uint32 gmask  = 65535u << (group * 16);
        return (ballot & gmask) == gmask;
    }
};
341 
// generalized all primitive, groups of 64 threads (2 warps);
// sm must point to shared storage with one byte per warp of the block
template <>
struct all_dispatch<64>
{
    static __device__ __forceinline__ bool enact(const bool p, volatile uint8* sm)
    {
        // each warp publishes its own vote, then the block synchronizes
        sm[ warp_id() ] = __all( p );
        __syncthreads();

        // AND together the two votes of this thread's warp pair
        const uint32 pair = warp_id() >> 1;
        return sm[ pair * 2 ] & sm[ pair * 2 + 1 ];
    }
};
// generalized all primitive, groups of 128 threads (4 warps);
// sm must point to shared storage with one byte per warp of the block
template <>
struct all_dispatch<128>
{
    static __device__ __forceinline__ bool enact(const bool p, volatile uint8* sm)
    {
        // each warp publishes its own vote, then the block synchronizes
        sm[ warp_id() ] = __all(p);
        __syncthreads();

        // check whether all 4 warps of this thread's group voted true.
        // FIX: the original indexed sm[ bid * 4 + warp_tid() & 3 ]; since
        // '+' binds tighter than '&', that reduced to sm[ warp_tid() & 3 ],
        // making every group read group 0's flags — parentheses added.
        const uint32 bid = warp_id() >> 2;
        return __all( sm[ bid * 4 + (warp_tid() & 3) ] );
    }
};
// generalized all primitive, groups of 256 threads (8 warps);
// sm must point to shared storage with one byte per warp of the block
template <>
struct all_dispatch<256>
{
    static __device__ __forceinline__ bool enact(const bool p, volatile uint8* sm)
    {
        // each warp publishes its own vote, then the block synchronizes
        sm[ warp_id() ] = __all(p);
        __syncthreads();

        // check whether all 8 warps of this thread's group voted true.
        // FIX: the original indexed sm[ bid * 8 + warp_tid() & 7 ]; since
        // '+' binds tighter than '&', that reduced to sm[ warp_tid() & 7 ],
        // making every group read group 0's flags — parentheses added.
        const uint32 bid = warp_id() >> 3;
        return __all( sm[ bid * 8 + (warp_tid() & 7) ] );
    }
};
384 
/// generalized all primitive: returns true iff the predicate \p p is true
/// for all COUNT threads of this thread's group
///
/// \param p   per-thread predicate
/// \param sm  shared-memory scratch, one byte per warp (needed only for COUNT > 32)
template <uint32 COUNT>
__device__ __forceinline__ bool all(const bool p, volatile uint8* sm = NULL)
{
    return all_dispatch<COUNT>::enact( p, sm );
}
398 
// generalized any primitive: true iff the predicate holds for at least one
// thread in a group of COUNT threads (specialized per group size below)
template <uint32 COUNT>
struct any_dispatch {};
402 
// generalized any primitive, one full warp: maps directly onto the
// hardware vote instruction (sm is unused)
template <>
struct any_dispatch<32>
{
    static __device__ __forceinline__ bool enact(const bool p, volatile uint8* sm)
    {
        return __any( p );
    }
};
412 
// generalized any primitive, groups of 2 threads within a warp (sm unused)
template <>
struct any_dispatch<2>
{
    static __device__ __forceinline__ bool enact(const bool p, volatile uint8* sm)
    {
        // ballot the predicate, then test whether either bit of this
        // thread's 2-lane group is set
        const uint32 ballot = __ballot( p );
        const uint32 group  = (threadIdx.x & 31) >> 1;
        const uint32 gmask  = 3u << (group * 2);
        return (ballot & gmask) != 0u;
    }
};
425 
// generalized any primitive, groups of 4 threads within a warp (sm unused)
template <>
struct any_dispatch<4>
{
    static __device__ __forceinline__ bool enact(const bool p, volatile uint8* sm)
    {
        // ballot the predicate, then test whether any bit of this
        // thread's lane group is set
        const uint32 ballot = __ballot( p );
        const uint32 group  = (threadIdx.x & 31) >> 2;
        const uint32 gmask  = 15u << (group * 4);
        return (ballot & gmask) != 0u;
    }
};
438 
// generalized any primitive, groups of 8 threads within a warp (sm unused)
template <>
struct any_dispatch<8>
{
    static __device__ __forceinline__ bool enact(const bool p, volatile uint8* sm)
    {
        // ballot the predicate, then test whether any bit of this
        // thread's lane group is set
        const uint32 ballot = __ballot( p );
        const uint32 group  = (threadIdx.x & 31) >> 3;
        const uint32 gmask  = 255u << (group * 8);
        return (ballot & gmask) != 0u;
    }
};
451 
// generalized any primitive, groups of 16 threads within a warp (sm unused)
template <>
struct any_dispatch<16>
{
    static __device__ __forceinline__ bool enact(const bool p, volatile uint8* sm)
    {
        // ballot the predicate, then test whether any bit of this
        // thread's half-warp is set
        const uint32 ballot = __ballot( p );
        const uint32 group  = (threadIdx.x & 31) >> 4;
        const uint32 gmask  = 65535u << (group * 16);
        return (ballot & gmask) != 0u;
    }
};
464 
// generalized any primitive, groups of 64 threads (2 warps);
// sm must point to shared storage with one byte per warp of the block
template <>
struct any_dispatch<64>
{
    static __device__ __forceinline__ bool enact(const bool p, volatile uint8* sm)
    {
        // each warp publishes its own vote, then the block synchronizes
        sm[ warp_id() ] = __any( p );
        __syncthreads();

        // OR together the two votes of this thread's warp pair
        const uint32 pair = warp_id() >> 1;
        return sm[ pair * 2 ] | sm[ pair * 2 + 1 ];
    }
};
// generalized any primitive, groups of 128 threads (4 warps);
// sm must point to shared storage with one byte per warp of the block
template <>
struct any_dispatch<128>
{
    static __device__ __forceinline__ bool enact(const bool p, volatile uint8* sm)
    {
        // each warp publishes its own vote, then the block synchronizes
        sm[ warp_id() ] = __any(p);
        __syncthreads();

        // check whether any of the 4 warps of this thread's group voted true.
        // FIX: the original indexed sm[ bid * 4 + warp_tid() & 3 ]; since
        // '+' binds tighter than '&', that reduced to sm[ warp_tid() & 3 ],
        // making every group read group 0's flags — parentheses added.
        const uint32 bid = warp_id() >> 2;
        return __any( sm[ bid * 4 + (warp_tid() & 3) ] );
    }
};
// generalized any primitive, groups of 256 threads (8 warps);
// sm must point to shared storage with one byte per warp of the block
template <>
struct any_dispatch<256>
{
    static __device__ __forceinline__ bool enact(const bool p, volatile uint8* sm)
    {
        // each warp publishes its own vote, then the block synchronizes
        sm[ warp_id() ] = __any(p);
        __syncthreads();

        // check whether any of the 8 warps of this thread's group voted true.
        // FIX: the original indexed sm[ bid * 8 + warp_tid() & 7 ]; since
        // '+' binds tighter than '&', that reduced to sm[ warp_tid() & 7 ],
        // making every group read group 0's flags — parentheses added.
        const uint32 bid = warp_id() >> 3;
        return __any( sm[ bid * 8 + (warp_tid() & 7) ] );
    }
};
507 
/// generalized any primitive: returns true iff the predicate \p p is true
/// for at least one of the COUNT threads of this thread's group
///
/// \param p   per-thread predicate
/// \param sm  shared-memory scratch, one byte per warp (needed only for COUNT > 32)
template <uint32 COUNT>
__device__ __forceinline__ bool any(const bool p, volatile uint8* sm = NULL)
{
    return any_dispatch<COUNT>::enact( p, sm );
}
521 
522 } // namespace cuda
523 } // namespace nvbio