warp_atomics.h
/*
 * CUGAR : Cuda Graphics Accelerator
 *
 * Copyright (c) 2010-2018, NVIDIA Corporation
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *    * Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *    * Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *    * Neither the name of NVIDIA Corporation nor the
 *      names of its contributors may be used to endorse or promote products
 *      derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#pragma once

#include <cugar/basic/types.h>
#include <cugar/bits/popcount.h>
#include <cub/cub.cuh>

namespace cugar {
namespace cuda {

/// increment a shared counter by one for each active lane, issuing a single
/// atomic per warp and per distinct pointer, and return to each lane the value
/// it would have obtained from an individual atomicAdd(ptr, 1)
///
__device__ __forceinline__
unsigned int warp_increment(unsigned int *ptr)
{
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
    const unsigned int lane_id = threadIdx.x & 31;

    // partition the active lanes by the pointer they target; note that the
    // previous __match_all_sync form would return a zero mask (and hence
    // elect no leader) whenever the lanes diverge on ptr
    const unsigned int mask = __match_any_sync(__activemask(), (unsigned long long)ptr);
    const int leader = __ffs(mask) - 1;                 // select a leader

    unsigned int res = 0;
    if (lane_id == leader)                              // the leader does the update
        res = atomicAdd(ptr, __popc(mask));

    res = __shfl_sync(mask, res, leader);               // get the leader's old value
    return res + __popc(mask & ((1u << lane_id) - 1));  // compute this lane's old value
#else
    const uint32 warp_tid   = threadIdx.x & 31;
    const uint32 warp_mask  = __ballot_sync( __activemask(), true );
    const uint32 warp_count = __popc( warp_mask );
    const uint32 warp_scan  = __popc( warp_mask & ((1u << warp_tid) - 1) );
        // count of earlier active lanes; the masked form avoids the undefined
        // shift by 32 that the left-shift idiom would incur at lane 0

    const uint32 first_tid = ffs(warp_mask) - 1;

    uint32 broadcast_offset = 0;

    // acquire an offset for this warp
    if (warp_tid == first_tid)
        broadcast_offset = atomicAdd( ptr, warp_count );

    // obtain the offset from the first participating thread
    const uint32 offset = cub::ShuffleIndex<32>(broadcast_offset, first_tid, __activemask());

    // compute the per-thread offset
    return offset + warp_scan;
#endif
}
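
// Usage sketch (not part of the original header): append the indices of
// threads passing a predicate to a global queue, one atomic per warp;
// `enqueue`, `queue` and `counter` are hypothetical names.
//
// __global__ void enqueue(unsigned int* counter, unsigned int* queue)
// {
//     const unsigned int i = threadIdx.x + blockIdx.x * blockDim.x;
//     if (i & 1)  // any divergent predicate works
//         queue[ warp_increment(counter) ] = i;
// }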

///
/// A helper class implementing warp-synchronous atomics on a shared counter
/// with compile-time increments, issuing a single atomic per warp
///
struct warp_static_atomic
{
    /// constructor
    ///
    /// \param pool     the destination counter
    __device__ __forceinline__
    warp_static_atomic(uint32* pool)
        : m_dest(pool) {}

    /// add N to the counter for each lane with p set, using a single atomic per warp
    ///
    template <uint32 N>
    __device__ __forceinline__
    void add(bool p)
    {
        warp_static_atomic::static_add<N>( p, m_dest );
    }

    /// subtract N from the counter for each lane with p set, using a single atomic per warp
    ///
    template <uint32 N>
    __device__ __forceinline__
    void sub(bool p)
    {
        warp_static_atomic::static_sub<N>( p, m_dest );
    }

    /// add N for each lane with p set, returning in result the per-lane value
    /// of the counter before the update
    ///
    template <uint32 N>
    __device__ __forceinline__
    void add(bool p, uint32* result)
    {
        warp_static_atomic::static_add<N>( p, m_dest, result );
    }

    /// subtract N for each lane with p set, returning in result the per-lane
    /// value of the counter before the update
    ///
    template <uint32 N>
    __device__ __forceinline__
    void sub(bool p, uint32* result)
    {
        warp_static_atomic::static_sub<N>( p, m_dest, result );
    }

    // --- stateless methods --- //

    /// add N to dest for each lane with p set, using a single atomic per warp
    ///
    template <uint32 N>
    __device__ __forceinline__
    static void static_add(bool p, uint32* dest)
    {
        const uint32 warp_tid   = threadIdx.x & 31;
        const uint32 warp_mask  = __ballot_sync( __activemask(), p );
        const uint32 warp_count = __popc( warp_mask );
        const uint32 warp_scan  = __popc( warp_mask & ((1u << warp_tid) - 1) ); // count of earlier participating lanes

        // perform the atomic on the first participating lane only
        if (warp_scan == 0 && p)
            atomicAdd( dest, warp_count * N );
    }
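
    // worked example (illustrative): if lanes 1, 4 and 7 of a warp call
    // static_add<N> with p = true and all other lanes with p = false, then
    // warp_mask = 0x92, warp_count = 3, and warp_scan is 0, 1, 2 on lanes
    // 1, 4, 7 respectively; only lane 1 (warp_scan == 0 && p) issues the
    // single atomicAdd( dest, 3 * N ) on behalf of the whole warp.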

    /// subtract N from dest for each lane with p set, using a single atomic per warp
    ///
    template <uint32 N>
    __device__ __forceinline__
    static void static_sub(bool p, uint32* dest)
    {
        const uint32 warp_tid   = threadIdx.x & 31;
        const uint32 warp_mask  = __ballot_sync( __activemask(), p );
        const uint32 warp_count = __popc( warp_mask );
        const uint32 warp_scan  = __popc( warp_mask & ((1u << warp_tid) - 1) ); // count of earlier participating lanes

        // perform the atomic on the first participating lane only
        if (warp_scan == 0 && p)
            atomicSub( dest, warp_count * N );
    }

    /// add N to dest for each lane with p set, returning in result the per-lane
    /// value of the counter before the update
    ///
    template <uint32 N>
    __device__ __forceinline__
    static void static_add(bool p, uint32* dest, uint32* result)
    {
        const uint32 warp_tid   = threadIdx.x & 31;
        const uint32 warp_mask  = __ballot_sync( __activemask(), p );
        const uint32 warp_count = __popc( warp_mask );
        const uint32 warp_scan  = __popc( warp_mask & ((1u << warp_tid) - 1) ); // count of earlier participating lanes

        const uint32 first_tid = ffs(warp_mask) - 1;

        uint32 broadcast_offset = 0;

        // acquire an offset for this warp
        if (warp_tid == first_tid)
            broadcast_offset = atomicAdd( dest, warp_count * N );

        // obtain the offset from the first participating thread
        const uint32 offset = cub::ShuffleIndex<32>(broadcast_offset, first_tid, __activemask());

        // compute the per-thread offset
        *result = offset + warp_scan * N;
    }
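
    // worked example (illustrative): with N = 2 and lanes 1, 4 and 7
    // participating, the leader (lane 1) fetches base = atomicAdd( dest, 6 );
    // the three lanes then receive result = base + 0, base + 2 and base + 4,
    // i.e. each participating lane gets a private, contiguous range of N slots.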

    /// subtract N from dest for each lane with p set, returning in result the
    /// per-lane value of the counter before the update
    ///
    template <uint32 N>
    __device__ __forceinline__
    static void static_sub(bool p, uint32* dest, uint32* result)
    {
        const uint32 warp_tid   = threadIdx.x & 31;
        const uint32 warp_mask  = __ballot_sync( __activemask(), p );
        const uint32 warp_count = __popc( warp_mask );
        const uint32 warp_scan  = __popc( warp_mask & ((1u << warp_tid) - 1) ); // count of earlier participating lanes

        const uint32 first_tid = ffs(warp_mask) - 1;

        uint32 broadcast_offset = 0;

        // acquire an offset for this warp
        if (warp_tid == first_tid)
            broadcast_offset = atomicSub( dest, warp_count * N );

        // obtain the offset from the first participating thread
        const uint32 offset = cub::ShuffleIndex<32>(broadcast_offset, first_tid, __activemask());

        // compute the per-thread offset
        *result = offset - warp_scan * N;
    }

private:
    uint32* m_dest;
};
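
// Usage sketch (not part of the original header): compact the indices of
// items passing a predicate into an output queue; `compact`, `queue` and
// `counter` are hypothetical names. All threads must reach the call, with
// non-passing threads supplying p = false.
//
// __global__ void compact(const float* values, uint32 n, uint32* counter, uint32* queue)
// {
//     const uint32 i = threadIdx.x + blockIdx.x * blockDim.x;
//     const bool   p = (i < n) && (values[i] > 0.0f);
//
//     uint32 slot;
//     warp_static_atomic::static_add<1>( p, counter, &slot );
//     if (p)
//         queue[slot] = i;    // slot is unique across all passing threads
// }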

///
/// A helper class implementing warp-synchronous atomics on a shared counter
/// with run-time operands, combining a CUB warp-wide reduction or scan with
/// a single atomic per warp
///
struct warp_atomic
{
    /// per-warp shared-memory storage required by the underlying CUB primitives
    struct temp_storage_type
    {
        union {
            typename cub::WarpScan<uint32>::TempStorage   scan_storage;
            typename cub::WarpReduce<uint32>::TempStorage reduce_storage;
        };
    };

    /// constructor
    ///
    /// \param dest             the destination counter
    /// \param temp_storage     shared-memory storage for this warp
    __device__ __forceinline__
    warp_atomic(uint32* dest, temp_storage_type& temp_storage)
        : m_dest(dest), m_temp_storage(temp_storage) {}

    /// atomically add n to the counter, using a single atomic per warp
    ///
    __device__ __forceinline__
    void add(uint32 n)
    {
        warp_atomic::add( n, m_dest, m_temp_storage );
    }

    /// atomically subtract n from the counter, using a single atomic per warp
    ///
    __device__ __forceinline__
    void sub(uint32 n)
    {
        warp_atomic::sub( n, m_dest, m_temp_storage );
    }

    /// add N to the counter for each lane with p set, using a single atomic per warp
    ///
    template <uint32 N>
    __device__ __forceinline__
    void add(bool p)
    {
        warp_atomic::static_add<N>( p, m_dest );
    }

    /// subtract N from the counter for each lane with p set, using a single atomic per warp
    ///
    template <uint32 N>
    __device__ __forceinline__
    void sub(bool p)
    {
        warp_atomic::static_sub<N>( p, m_dest );
    }

    /// atomically add n to the counter, returning in result the per-lane value
    /// of the counter before the update
    ///
    __device__ __forceinline__
    void add(uint32 n, uint32* result)
    {
        warp_atomic::add( n, m_dest, result, m_temp_storage );
    }

    /// atomically subtract n from the counter, returning in result the per-lane
    /// value of the counter before the update
    ///
    __device__ __forceinline__
    void sub(uint32 n, uint32* result)
    {
        warp_atomic::sub( n, m_dest, result, m_temp_storage );
    }

    /// add N for each lane with p set, returning the per-lane offsets in result
    ///
    template <uint32 N>
    __device__ __forceinline__
    void add(bool p, uint32* result)
    {
        warp_atomic::static_add<N>( p, m_dest, result );
    }

    /// subtract N for each lane with p set, returning the per-lane offsets in result
    ///
    template <uint32 N>
    __device__ __forceinline__
    void sub(bool p, uint32* result)
    {
        warp_atomic::static_sub<N>( p, m_dest, result );
    }

    // --- stateless methods --- //

    /// atomically add n to dest, using a warp reduction and a single atomic per warp
    ///
    __device__ __forceinline__
    static void add(uint32 n, uint32* dest, temp_storage_type& temp_storage)
    {
        // issue a warp-reduction; the total is valid on lane 0
        const uint32 warp_count = cub::WarpReduce<uint32>(temp_storage.reduce_storage).Sum(n);

        // issue a per-warp atomic
        const uint32 warp_tid = threadIdx.x & 31;
        if (warp_tid == 0)
            atomicAdd( dest, warp_count );
    }
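
    // worked example (illustrative): if the lanes of a warp pass
    // n = 3, 1, 0, 2, 0, ..., 0, the reduction yields warp_count = 6 on
    // lane 0, which issues the single atomicAdd( dest, 6 ).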

    /// atomically subtract n from dest, using a warp reduction and a single atomic per warp
    ///
    __device__ __forceinline__
    static void sub(uint32 n, uint32* dest, temp_storage_type& temp_storage)
    {
        // issue a warp-reduction; the total is valid on lane 0
        const uint32 warp_count = cub::WarpReduce<uint32>(temp_storage.reduce_storage).Sum(n);

        // issue a per-warp atomic
        const uint32 warp_tid = threadIdx.x & 31;
        if (warp_tid == 0)
            atomicSub( dest, warp_count );
    }

    /// atomically add n to dest, using a warp scan and a single atomic per warp,
    /// returning in result the per-lane value of the counter before the update
    ///
    __device__ __forceinline__
    static void add(uint32 n, uint32* dest, uint32* result, temp_storage_type& temp_storage)
    {
        uint32 warp_scan, warp_count;

        // issue a warp-scan to compute each lane's exclusive prefix sum and the warp total
        cub::WarpScan<uint32>(temp_storage.scan_storage).ExclusiveSum(n, warp_scan, warp_count);

        const uint32 warp_tid = threadIdx.x & 31;

        // issue a per-warp atomic
        uint32 base_index = 0;
        if (warp_tid == 0)
            base_index = atomicAdd( dest, warp_count );

        // compute the per-thread offset by broadcasting lane 0's base index
        *result = cub::ShuffleIndex<32>( base_index, 0, __activemask() ) + warp_scan;
    }
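
    // worked example (illustrative): if the lanes of a warp pass
    // n = 3, 1, 0, 2, 0, ..., 0, the scan yields warp_scan = 0, 3, 4, 4, 6,
    // ..., 6 and warp_count = 6; lane 0 fetches base = atomicAdd( dest, 6 )
    // and each lane returns base + its warp_scan.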

    /// atomically subtract n from dest, using a warp scan and a single atomic per warp,
    /// returning in result the per-lane value of the counter before the update
    ///
    __device__ __forceinline__
    static void sub(uint32 n, uint32* dest, uint32* result, temp_storage_type& temp_storage)
    {
        uint32 warp_scan, warp_count;

        // issue a warp-scan to compute each lane's exclusive prefix sum and the warp total
        cub::WarpScan<uint32>(temp_storage.scan_storage).ExclusiveSum(n, warp_scan, warp_count);

        const uint32 warp_tid = threadIdx.x & 31;

        // issue a per-warp atomic
        uint32 base_index = 0;
        if (warp_tid == 0)
            base_index = atomicSub( dest, warp_count );

        // compute the per-thread offset by broadcasting lane 0's base index
        *result = cub::ShuffleIndex<32>( base_index, 0, __activemask() ) - warp_scan;
    }

    /// add N to dest for each lane with p set, using a single atomic per warp
    ///
    template <uint32 N>
    __device__ __forceinline__
    static void static_add(bool p, uint32* dest)
    {
        warp_static_atomic::static_add<N>( p, dest );
    }

    /// subtract N from dest for each lane with p set, using a single atomic per warp
    ///
    template <uint32 N>
    __device__ __forceinline__
    static void static_sub(bool p, uint32* dest)
    {
        warp_static_atomic::static_sub<N>( p, dest );
    }

    /// add N for each lane with p set, returning the per-lane offsets in result
    ///
    template <uint32 N>
    __device__ __forceinline__
    static void static_add(bool p, uint32* dest, uint32* result)
    {
        warp_static_atomic::static_add<N>( p, dest, result );
    }

    /// subtract N for each lane with p set, returning the per-lane offsets in result
    ///
    template <uint32 N>
    __device__ __forceinline__
    static void static_sub(bool p, uint32* dest, uint32* result)
    {
        warp_static_atomic::static_sub<N>( p, dest, result );
    }

private:
    uint32*             m_dest;
    temp_storage_type&  m_temp_storage;
};
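
// Usage sketch (not part of the original header): allocate a run-time number
// of output slots per thread from a global pool; `alloc`, `pool`, `out` and
// `n_items` are hypothetical names.
//
// __global__ void alloc(uint32* pool, uint32* out)
// {
//     __shared__ warp_atomic::temp_storage_type temp_storage[ 8 ]; // one per warp of a 256-thread block
//
//     const uint32 warp_id = threadIdx.x >> 5;
//     const uint32 n_items = threadIdx.x & 3;  // some per-thread count
//
//     uint32 offset;
//     warp_atomic( pool, temp_storage[ warp_id ] ).add( n_items, &offset );
//
//     for (uint32 j = 0; j < n_items; ++j)
//         out[ offset + j ] = threadIdx.x;
// }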

} // namespace cuda
} // namespace cugar