NVBIO
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
work_queue_multipass_inl.h
Go to the documentation of this file.
1 /*
2  * nvbio
3  * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #pragma once
29 
30 #include <nvbio/basic/types.h>
31 #include <nvbio/basic/numbers.h>
32 #include <nvbio/basic/cuda/arch.h>
33 #include <cub/cub.cuh>
34 #include <thrust/scan.h>
35 
36 namespace nvbio {
37 namespace cuda {
38 
41 
42 namespace wq {
43 
46 
47 template <
49  typename WorkUnitT,
50  typename WorkStreamT,
51  bool DO_LOADS>
52 __global__
54  const uint32 n_tile_grids,
56  const uint32 in_queue_id,
57  const uint32 in_queue_size,
58  const WorkStreamT stream,
59  uint32 stream_begin)
60 {
61  typedef WorkUnitT WorkUnit;
62 
63  const uint32 grid_threads = gridDim.x * BLOCKDIM;
64  const uint32 queue_capacity = grid_threads * n_tile_grids;
65  const uint32 thread_id = threadIdx.x + blockIdx.x*BLOCKDIM;
66 
67  WorkUnit* work_queue = context.m_work_queue + queue_capacity*in_queue_id;
68  uint32* continuations = context.m_continuations;
69 
70  const uint32 stream_end = stream.size();
71 
72  uint32 in_work_queue_size = in_queue_size;
73 
74  if (DO_LOADS)
75  {
76  // try to load more work to do
77  for (uint32 i = 0; i < n_tile_grids; ++i)
78  {
79  const uint32 work_begin = grid_threads * i;
80  const uint32 work_id = thread_id + work_begin;
81 
82  // if the work queue is not full, and there's remaining work in the stream, fetch some more
83  if ((work_begin <= in_work_queue_size) &&
84  (work_begin + grid_threads > in_work_queue_size) &&
85  (stream_begin < stream_end))
86  {
87  const uint32 n_loaded = nvbio::min( stream_end - stream_begin, grid_threads - (in_work_queue_size - work_begin) );
88 
89  // fetch a new work unit
90  if ((work_id >= in_work_queue_size) &&
91  (work_id - in_work_queue_size < n_loaded))
92  stream.get( stream_begin + work_id - in_work_queue_size, work_queue + work_id, make_uint2( work_id, in_queue_id ) );
93 
94  in_work_queue_size += n_loaded;
95  stream_begin += n_loaded;
96  }
97  }
98  }
99 
100  // compute how many tiles are needed to cover the input queue
101  const uint32 n_active_tile_grids = (in_work_queue_size + grid_threads-1) / grid_threads;
102 
103  // execute the queued work units
104  for (uint32 i = 0; i < n_active_tile_grids; ++i)
105  {
106  const uint32 work_id = thread_id + grid_threads * i;
107 
108  // execute this thread's work unit
109  if (work_id < in_work_queue_size)
110  {
111  const bool has_continuation = work_queue[ work_id ].run( stream );
112  continuations[ work_id ] = has_continuation ? 1u : 0u;
113  }
114  }
115 }
116 
117 template <
119  typename WorkUnitT,
120  typename WorkStreamT>
121 __global__
123  const uint32 n_tile_grids,
125  const uint32 in_queue_id,
126  const uint32 in_queue_size,
127  const WorkStreamT stream,
128  uint32 stream_begin)
129 {
130  typedef WorkUnitT WorkUnit;
131 
132  const uint32 grid_threads = gridDim.x * BLOCKDIM;
133  const uint32 queue_capacity = grid_threads * n_tile_grids;
134  const uint32 thread_id = threadIdx.x + blockIdx.x*BLOCKDIM;
135 
136  WorkUnit* work_queue = context.m_work_queue + queue_capacity*in_queue_id;
137 
138  const uint32 stream_end = stream.size();
139 
140  uint32 in_work_queue_size = in_queue_size;
141 
142  // try to load more work to do
143  for (uint32 i = 0; i < n_tile_grids; ++i)
144  {
145  const uint32 work_begin = grid_threads * i;
146  const uint32 work_id = thread_id + work_begin;
147 
148  // if the work queue is not full, and there's remaining work in the stream, fetch some more
149  if ((work_begin <= in_work_queue_size) &&
150  (work_begin + grid_threads > in_work_queue_size) &&
151  (stream_begin < stream_end))
152  {
153  const uint32 n_loaded = nvbio::min( stream_end - stream_begin, grid_threads - (in_work_queue_size - work_begin) );
154 
155  // fetch a new work unit
156  if ((work_id >= in_work_queue_size) &&
157  (work_id - in_work_queue_size < n_loaded))
158  stream.get( stream_begin + work_id - in_work_queue_size, work_queue + work_id, make_uint2( work_id, in_queue_id ) );
159 
160  in_work_queue_size += n_loaded;
161  stream_begin += n_loaded;
162  }
163  }
164 }
165 
166 template <
168  typename WorkUnitT,
169  typename WorkStream,
170  typename WorkMover>
171 __global__
173  const uint32 n_tile_grids,
175  const uint32 in_queue_id,
176  const uint32 in_queue_size,
177  const WorkStream stream,
178  const WorkMover mover)
179 {
180  typedef WorkUnitT WorkUnit;
181 
182  const uint32 grid_threads = gridDim.x * BLOCKDIM;
183  const uint32 queue_capacity = grid_threads * n_tile_grids;
184  const uint32 thread_id = threadIdx.x + blockIdx.x*BLOCKDIM;
185 
186  WorkUnit* in_work_queue = context.m_work_queue + queue_capacity*(in_queue_id ? 1 : 0);
187  WorkUnit* out_work_queue = context.m_work_queue + queue_capacity*(in_queue_id ? 0 : 1);
188  const uint32* continuations = context.m_continuations;
189 
190  // compute how many tiles are needed to cover the input queue
191  const uint32 n_active_tile_grids = (in_queue_size + grid_threads-1) / grid_threads;
192 
193  // move the continuations from the input queue to the output queue
194  for (uint32 i = 0; i < n_active_tile_grids; ++i)
195  {
196  const uint32 work_id = thread_id + grid_threads * i;
197 
198  if (work_id < in_queue_size)
199  {
200  const uint32 prev_slot = work_id ? continuations[ work_id-1 ] : 0u;
201  const uint32 next_slot = continuations[ work_id ];
202  const bool has_continuation = (next_slot > prev_slot);
203 
204  if (has_continuation)
205  {
206  // move the work-unit
207  mover.move( stream,
208  make_uint2( work_id, in_queue_id ? 1 : 0 ), &in_work_queue[ work_id ],
209  make_uint2( prev_slot, in_queue_id ? 0 : 1 ), &out_work_queue[ prev_slot ] );
210  }
211  }
212  }
213 }
214 
216 
217 } // namespace wq
218 
219 // consume a stream of work units
220 //
221 template <
222  typename WorkUnitT,
224 template <typename WorkStream, typename WorkMover>
226 {
227  // compute the number of blocks we are going to launch
228  const uint32 n_blocks = m_separate_loads ?
229  (uint32)cuda::max_active_blocks( wq::mk_work_queue_kernel<BLOCKDIM,WorkUnit,WorkStream,false>, BLOCKDIM, 0u ) :
230  (uint32)cuda::max_active_blocks( wq::mk_work_queue_kernel<BLOCKDIM,WorkUnit,WorkStream,true>, BLOCKDIM, 0u );
231  const uint32 grid_threads = n_blocks * BLOCKDIM;
232 
233  const uint32 n_tile_grids = m_capacity / grid_threads;
234  const uint32 queue_capacity = grid_threads * n_tile_grids;
235 
236  m_continuations.resize( queue_capacity );
237  m_work_queue.resize( queue_capacity * 2 ); // alloc space for an input and an output queue
238 
239  const uint32 stream_size = stream.size();
240 
241  uint32 in = 0;
242  uint32 in_queue_size = 0;
243  uint32 stream_begin = 0;
245 
246  typename thrust::device_vector<WorkUnit>::iterator in_queue_begin = m_work_queue.begin();
247  typename thrust::device_vector<WorkUnit>::iterator out_queue_begin = m_work_queue.begin() + queue_capacity;
248 
249  while (in_queue_size || stream_begin < stream_end)
250  {
251  const uint32 to_load = nvbio::min( queue_capacity - in_queue_size, stream_end - stream_begin );
252 
253  if (m_separate_loads)
254  {
255  // launch the loading kernel
256  wq::mk_load_kernel<BLOCKDIM,WorkUnit,WorkStream> <<<n_blocks,BLOCKDIM>>>( n_tile_grids, get_context(), in, in_queue_size, stream, stream_begin );
257 
258  // update stream pointer
259  stream_begin += to_load;
260 
261  // update input queue size
262  in_queue_size += to_load;
263 
264  // launch the consuming kernel
265  wq::mk_work_queue_kernel<BLOCKDIM,WorkUnit,WorkStream,false> <<<n_blocks,BLOCKDIM>>>( n_tile_grids, get_context(), in, in_queue_size, stream, stream_begin );
266  }
267  else
268  {
269  // launch the consuming kernel
270  wq::mk_work_queue_kernel<BLOCKDIM,WorkUnit,WorkStream,true> <<<n_blocks,BLOCKDIM>>>( n_tile_grids, get_context(), in, in_queue_size, stream, stream_begin );
271 
272  // update stream pointer
273  stream_begin += to_load;
274 
275  // update input queue size
276  in_queue_size += to_load;
277  }
278 
279  // compact the output
281  m_continuations.begin(),
282  m_continuations.begin() + in_queue_size,
283  m_continuations.begin() );
284 
285  // synchronize on the scan
286  cudaDeviceSynchronize();
287 
288  const uint32 out_queue_size = m_continuations[ in_queue_size - 1 ];
289 
290  // launch the movement kernel
291  wq::mk_move_kernel<BLOCKDIM,WorkUnit,WorkStream,WorkMover> <<<n_blocks,BLOCKDIM>>>( n_tile_grids, get_context(), in, in_queue_size, stream, mover );
292 
293  // swap queue pointers
294  in = in ? 0u : 1u;
295  std::swap( in_queue_begin, out_queue_begin );
296 
297  in_queue_size = out_queue_size;
298  }
299 }
300 
302 
303 } // namespace cuda
304 } // namespace nvbio