NVBIO
work_queue_persistent_inl.h
/*
 * nvbio
 * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *    * Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *    * Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *    * Neither the name of the NVIDIA CORPORATION nor the
 *      names of its contributors may be used to endorse or promote products
 *      derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#pragma once

#include <nvbio/basic/types.h>
#include <nvbio/basic/numbers.h>
#include <nvbio/basic/cuda/arch.h>
#include <cub/cub.cuh>
#include <thrust/copy.h>

namespace nvbio {
namespace cuda {


//
// ----------------------- PersistentWarpsQueueTag Implementation -------------------------------------
//

namespace wq {

// persistent-warps work-queue kernel: each warp repeatedly grabs a full warp's worth of
// work units from the global pool and runs every unit to completion before refilling
template <
    uint32   BLOCKDIM,
    typename WorkUnitT,
    typename WorkStreamT>
__global__
void persistent_warps_work_queue_kernel(uint32* pool, const WorkStreamT stream, WorkQueueStats::View stats)
{
    typedef WorkUnitT WorkUnit;

    // compute the global thread id
    const uint32 thread_id = threadIdx.x + blockIdx.x*BLOCKDIM;

    // place a work-unit in local memory
    WorkUnit unit;

    // compute the number of warps in this block
    const uint32 NUM_WARPS = BLOCKDIM / cuda::Arch::WARP_SIZE;

    // alloc one shared memory integer for each warp
    __shared__ volatile uint32 sm_broadcast[ NUM_WARPS ];

    const uint32 stream_size = stream.size();
    while (1)
    {
        // use the first lane of each warp to fetch a new warp's worth of work
        if (warp_tid() == 0)
            sm_broadcast[ warp_id() ] = atomicAdd( pool, cuda::Arch::WARP_SIZE );

        // broadcast the work packet to the entire warp
        const uint32 work_base_id = sm_broadcast[ warp_id() ];

        // retire this warp if done
        if (work_base_id >= stream_size)
            return;

        // get the index of the work unit for this thread
        const uint32 work_id = warp_tid() + work_base_id;

        if (work_id < stream_size)
        {
            // fetch the work unit
            stats.sample( STREAM_EVENT );
            stream.get( work_id, &unit, make_uint2( thread_id, 0u ) );

            // keep an iteration counter
            uint32 work_iter = 0;

            // run the unit until completion
            do { stats.sample( RUN_EVENT ); ++work_iter; } while (unit.run( stream ));

            // sample the number of iterations this unit has been running
            stats.sample_iterations( work_iter );
        }
    }
}
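
// NOTE (editorial comment, not part of the original sources): in the kernel above, the pool
// index written by lane 0 into sm_broadcast is immediately re-read by all the other lanes of
// the same warp without an explicit barrier; the pattern relies on implicit warp-synchronous
// execution and on the volatile qualifier to keep the value from being cached in a register.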

} // namespace wq

// consume a stream of work units
//
template <
    typename WorkUnitT,
    uint32   BLOCKDIM>
template <typename WorkStream, typename WorkMover>
void WorkQueue<PersistentWarpsQueueTag,WorkUnitT,BLOCKDIM>::consume(const WorkStream stream, const WorkMover, WorkQueueStats* stats)
{
    // compute the number of blocks we are going to launch
    const uint32 max_blocks = (uint32)cuda::max_active_blocks( wq::persistent_warps_work_queue_kernel<BLOCKDIM,WorkUnit,WorkStream>, BLOCKDIM, 0u );
    const uint32 n_blocks   = nvbio::max( nvbio::min( max_blocks, m_capacity / BLOCKDIM ), 1u );

    // resize and reset the work pool counter
    m_pool.resize(1);
    m_pool[0] = 0u;

    // launch the consuming kernel
    wq::persistent_warps_work_queue_kernel<BLOCKDIM,WorkUnit,WorkStream> <<<n_blocks,BLOCKDIM>>>( thrust::raw_pointer_cast( &m_pool.front() ), stream, view( stats ) );
}
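
// The sketch below is an editorial illustration, not part of the original nvbio sources: it
// shows the interface a work-unit / work-stream pair needs in order to be consumed by the
// queues in this file (stream.size(), stream.get() and unit.run()). The MyWorkUnit and
// MyWorkStream types are hypothetical, and the call assumes the convenience consume(stream)
// overload declared for WorkQueue in work_queue.h.
//
//     struct MyWorkUnit
//     {
//         // return true as long as the unit still has work left to do
//         template <typename Stream>
//         __device__ bool run(const Stream& stream);
//     };
//
//     struct MyWorkStream
//     {
//         __device__ uint32 size() const;
//         __device__ void   get(const uint32 i, MyWorkUnit* unit, const uint2 slot) const;
//     };
//
//     // consume a stream with 128-thread blocks using the persistent-warps policy
//     nvbio::cuda::WorkQueue<nvbio::cuda::PersistentWarpsQueueTag, MyWorkUnit, 128u> work_queue;
//     MyWorkStream work_stream;
//     work_queue.consume( work_stream );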

//
// ----------------------- PersistentThreadsQueueTag Implementation ----------------------------------
//

namespace wq {

// persistent-threads work-queue kernel: each lane runs its own work unit, and a warp goes
// back to the global pool only when enough of its lanes have retired their current unit
template <
    uint32   BLOCKDIM,
    typename WorkUnitT,
    typename WorkStreamT>
__global__
void
persistent_threads_work_queue_kernel(uint32* pool, const uint32 max_inactive_lanes, const WorkStreamT stream, WorkQueueStats::View stats)
{
    typedef WorkUnitT WorkUnit;

    // compute the global thread id
    const uint32 thread_id = threadIdx.x + blockIdx.x*BLOCKDIM;

    // place a work-unit in local memory
    WorkUnit unit;

    // compute the number of warps in this block
    const uint32 NUM_WARPS = BLOCKDIM / cuda::Arch::WARP_SIZE;

    // alloc one shared memory integer for each warp
    __shared__ volatile uint32 sm_broadcast[ NUM_WARPS ];

    // sentinel marking lanes which currently have no work unit assigned
    const uint32 invalid_unit = uint32(-1);

    // start all threads with unassigned work units
    uint32 work_id   = invalid_unit;
    uint32 work_iter = 0u;
    bool   active    = true;

    const uint32 stream_size = stream.size();
    while (__any(active))
    {
        // check how many lanes need some new work to do
        const uint32 pop_mask  = __ballot( work_id == invalid_unit );
        const uint32 pop_count = __popc( pop_mask );

        // refill this warp only when utilization falls below a certain threshold
        if (pop_count > max_inactive_lanes)
        {
            // use the first lane of each warp to fetch a new warp's worth of work
            if (warp_tid() == 0)
                sm_broadcast[ warp_id() ] = atomicAdd( pool, pop_count );

            // broadcast the work packet to the entire warp
            const uint32 work_base_id = sm_broadcast[ warp_id() ];

            // let inactive lanes gather a new work unit
            if (work_id == invalid_unit)
            {
                // compute this lane's exclusive pop scan
                const uint32 pop_scan = __popc( pop_mask << (cuda::Arch::WARP_SIZE - warp_tid()) );
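                // worked example (editorial comment): with WARP_SIZE = 32, if lanes {1,3,5,9}
                // are inactive then pop_mask = 0x22A; for warp_tid() = 5 the mask is shifted
                // left by 27, which keeps only the bits of lanes 0..4, and __popc counts the
                // two inactive lanes (1 and 3) preceding this one, so pop_scan = 2 and this
                // lane picks up work unit work_base_id + 2.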

                // get the index of the work unit for this thread
                work_id   = pop_scan + work_base_id;
                work_iter = 0u;

                // fetch the unit if it is inside the stream, otherwise retire this lane
                if (work_id < stream_size)
                {
                    // fetch the work unit
                    stream.get( work_id, &unit, make_uint2( thread_id, 0u ) );
                    stats.sample( STREAM_EVENT );
                }
                else // signal the main loop that, as far as this lane is concerned, we could stop
                {
                    active  = false;
                    work_id = invalid_unit;
                }
            }
        }

        if (work_id < stream_size)
        {
            ++work_iter;

            // run the continuation
            stats.sample( RUN_EVENT );
            if (unit.run( stream ) == false)
            {
                // mark this unit as invalid
                work_id = invalid_unit;

                // sample the number of iterations this unit has been running
                stats.sample_iterations( work_iter );
            }
        }
    }
}

} // namespace wq

// consume a stream of work units
//
template <
    typename WorkUnitT,
    uint32   BLOCKDIM>
template <typename WorkStream, typename WorkMover>
void WorkQueue<PersistentThreadsQueueTag,WorkUnitT,BLOCKDIM>::consume(const WorkStream stream, const WorkMover, WorkQueueStats* stats)
{
    // compute the number of blocks we are going to launch
    const uint32 max_blocks = (uint32)cuda::max_active_blocks( wq::persistent_threads_work_queue_kernel<BLOCKDIM,WorkUnit,WorkStream>, BLOCKDIM, 0u );
    const uint32 n_blocks   = nvbio::max( nvbio::min( max_blocks, m_capacity / BLOCKDIM ), 1u );

    // resize and reset the work pool counter
    m_pool.resize(1);
    m_pool[0] = 0u;

    // compute the maximum number of tolerated inactive lanes, given the specified minimum utilization
    const uint32 min_active_lanes   = uint32( m_min_utilization * cuda::Arch::WARP_SIZE );
    const uint32 max_inactive_lanes = cuda::Arch::WARP_SIZE - min_active_lanes;
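    // worked example (editorial comment): with m_min_utilization = 0.75f and WARP_SIZE = 32,
    // min_active_lanes = 24 and max_inactive_lanes = 8, i.e. a warp goes back to the pool
    // only once more than 8 of its lanes have retired their work unit.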

    // launch the consuming kernel
    wq::persistent_threads_work_queue_kernel<BLOCKDIM,WorkUnit,WorkStream> <<<n_blocks,BLOCKDIM>>>( thrust::raw_pointer_cast( &m_pool.front() ), max_inactive_lanes, stream, view( stats ) );
}

} // namespace cuda
} // namespace nvbio