33 #include <cub/cub.cuh>
34 #include <thrust/scan.h>
// NOTE(review): this chunk is gapped -- the kernel's template header, signature,
// braces and several interior lines (e.g. the definition of work_id, stream_begin,
// stream_end) are not visible here. Judging by the launch sites further down in
// this file, this appears to be wq::mk_work_queue_kernel: the fused pass that
// both tops up the queue from the input stream and runs every unit -- TODO confirm.
57 const uint32 in_queue_size,
// WorkUnitT is presumably a template parameter of the (unseen) kernel header.
61 typedef WorkUnitT WorkUnit;
// Total queue slots: one tile of grid_threads units per tile grid.
64 const uint32 queue_capacity = grid_threads * n_tile_grids;
// Select the input half of the double-buffered queue (in_queue_id presumably 0/1
// -- the move kernel below treats it as a boolean; TODO confirm).
67 WorkUnit* work_queue = context.m_work_queue + queue_capacity*in_queue_id;
68 uint32* continuations = context.m_continuations;
// Running count of valid units in the queue; grows as new units are loaded below.
72 uint32 in_work_queue_size = in_queue_size;
// Phase 1: top up the queue from the stream, one tile grid at a time.
77 for (
uint32 i = 0; i < n_tile_grids; ++i)
79 const uint32 work_begin = grid_threads * i;
// Load only into the tile that straddles the current queue end, and only while
// the stream still has unconsumed items.
83 if ((work_begin <= in_work_queue_size) &&
84 (work_begin + grid_threads > in_work_queue_size) &&
85 (stream_begin < stream_end))
// Units to pull this round: bounded by both the remaining stream and the free
// space left in this tile.
87 const uint32 n_loaded =
nvbio::min( stream_end - stream_begin, grid_threads - (in_work_queue_size - work_begin) );
// Threads whose slot falls in the newly appended range fetch one unit each.
// work_id is presumably this thread's global slot index -- its definition is in
// a gap of this chunk; TODO confirm.
90 if ((work_id >= in_work_queue_size) &&
91 (work_id - in_work_queue_size < n_loaded))
92 stream.get( stream_begin + work_id - in_work_queue_size, work_queue + work_id, make_uint2( work_id, in_queue_id ) );
// All threads advance the bookkeeping identically: n_loaded is computed from
// values that are presumably uniform across the grid, so no communication is
// needed here -- TODO confirm uniformity against the unseen header.
94 in_work_queue_size += n_loaded;
95 stream_begin += n_loaded;
// Phase 2: run every valid unit; visit only tile grids containing valid slots
// (ceil-div of the filled size by the tile width).
101 const uint32 n_active_tile_grids = (in_work_queue_size + grid_threads-1) / grid_threads;
104 for (
uint32 i = 0; i < n_active_tile_grids; ++i)
109 if (work_id < in_work_queue_size)
// run() reports whether the unit wants another pass; record a 0/1 flag so the
// host can prefix-sum m_continuations and compact survivors (see the scan in
// the host loop below).
111 const bool has_continuation = work_queue[
work_id ].run( stream );
112 continuations[
work_id ] = has_continuation ? 1u : 0u;
typename WorkStreamT>
// NOTE(review): gapped chunk -- the kernel's name, signature and several interior
// lines are missing. Per the launch site below this appears to be
// wq::mk_load_kernel: the load-only phase used in "separate loads" mode. The body
// mirrors the top-up loop of the combined kernel above, with no run phase --
// TODO confirm.
123 const uint32 n_tile_grids,
126 const uint32 in_queue_size,
130 typedef WorkUnitT WorkUnit;
// Total queue slots: one tile of grid_threads units per tile grid.
133 const uint32 queue_capacity = grid_threads * n_tile_grids;
// Select the input half of the double-buffered queue.
136 WorkUnit* work_queue = context.m_work_queue + queue_capacity*in_queue_id;
// Running count of valid units; grows as units are loaded below.
140 uint32 in_work_queue_size = in_queue_size;
143 for (
uint32 i = 0; i < n_tile_grids; ++i)
145 const uint32 work_begin = grid_threads * i;
// Load only into the tile straddling the current queue end, while stream input remains.
149 if ((work_begin <= in_work_queue_size) &&
150 (work_begin + grid_threads > in_work_queue_size) &&
151 (stream_begin < stream_end))
// Bounded by remaining stream items and free space in this tile.
153 const uint32 n_loaded =
nvbio::min( stream_end - stream_begin, grid_threads - (in_work_queue_size - work_begin) );
// Threads in the newly appended slot range each fetch one unit from the stream.
// work_id / stream_begin / stream_end are defined in gaps of this chunk -- TODO confirm.
156 if ((work_id >= in_work_queue_size) &&
157 (work_id - in_work_queue_size < n_loaded))
158 stream.get( stream_begin + work_id - in_work_queue_size, work_queue + work_id, make_uint2( work_id, in_queue_id ) );
160 in_work_queue_size += n_loaded;
161 stream_begin += n_loaded;
173 const uint32 n_tile_grids,
176 const uint32 in_queue_size,
178 const WorkMover mover)
// NOTE(review): gapped chunk -- signature and interior lines missing. Per the
// launch site below this appears to be wq::mk_move_kernel: it compacts the
// surviving work units of the input queue into the output queue, using the
// prefix-summed continuation flags computed by the host loop -- TODO confirm.
180 typedef WorkUnitT WorkUnit;
183 const uint32 queue_capacity = grid_threads * n_tile_grids;
// Double-buffered queue halves: in_queue_id selects which half is input this pass
// (note the explicit 0/1 normalization, unlike the plain multiply in the kernels above).
186 WorkUnit* in_work_queue = context.m_work_queue + queue_capacity*(in_queue_id ? 1 : 0);
187 WorkUnit* out_work_queue = context.m_work_queue + queue_capacity*(in_queue_id ? 0 : 1);
188 const uint32* continuations = context.m_continuations;
// Visit only tile grids containing valid input slots.
191 const uint32 n_active_tile_grids = (in_queue_size + grid_threads-1) / grid_threads;
194 for (
uint32 i = 0; i < n_active_tile_grids; ++i)
198 if (work_id < in_queue_size)
// continuations[] presumably holds an inclusive prefix sum of the 0/1 flags
// (the scan call in the host loop below), so a unit survives exactly when its
// scanned value exceeds its predecessor's, and prev_slot is then its compacted
// output index.
200 const uint32 prev_slot = work_id ? continuations[ work_id-1 ] : 0u;
// next_slot is presumably continuations[ work_id ] -- its definition sits in a
// gap of this chunk; TODO confirm.
202 const bool has_continuation = (next_slot > prev_slot);
204 if (has_continuation)
// The mover relocates the unit into slot prev_slot of the output queue,
// passing (slot index, queue-half id) pairs for both source and destination.
208 make_uint2( work_id, in_queue_id ? 1 : 0 ), &in_work_queue[ work_id ],
209 make_uint2( prev_slot, in_queue_id ? 0 : 1 ), &out_work_queue[ prev_slot ] );
224 template <
typename WorkStream,
typename WorkMover>
// NOTE(review): gapped chunk -- the method's name and several interior lines are
// missing; this appears to be the host-side consume loop of the multi-pass work
// queue -- TODO confirm. Each pass: top up the input queue from the stream, run
// every unit, prefix-sum the 0/1 continuation flags, then compact survivors into
// the other half of the double buffer and swap halves.
228 const uint32 n_blocks = m_separate_loads ?
// Size the queue as a whole number of grid-sized tiles.
233 const uint32 n_tile_grids = m_capacity / grid_threads;
234 const uint32 queue_capacity = grid_threads * n_tile_grids;
236 m_continuations.resize( queue_capacity );
// Two halves: input queue and output queue, swapped after every pass.
237 m_work_queue.resize( queue_capacity * 2 );
246 typename thrust::device_vector<WorkUnit>::iterator in_queue_begin = m_work_queue.begin();
247 typename thrust::device_vector<WorkUnit>::iterator out_queue_begin = m_work_queue.begin() + queue_capacity;
// Keep going while either queued work or unconsumed stream input remains.
249 while (in_queue_size || stream_begin < stream_end)
// How many new units the queue can absorb this pass (free capacity vs. stream remainder).
251 const uint32 to_load =
nvbio::min( queue_capacity - in_queue_size, stream_end - stream_begin );
253 if (m_separate_loads)
// Separate-loads mode: a dedicated load kernel fills the queue, then the work
// kernel runs the units without loading.
256 wq::mk_load_kernel<BLOCKDIM,WorkUnit,WorkStream> <<<n_blocks,BLOCKDIM>>>( n_tile_grids, get_context(), in, in_queue_size,
stream, stream_begin );
259 stream_begin += to_load;
262 in_queue_size += to_load;
// The trailing false/true template flag presumably selects whether the kernel
// performs the load phase itself -- the flag's declaration is outside this
// chunk; TODO confirm.
265 wq::mk_work_queue_kernel<BLOCKDIM,WorkUnit,WorkStream,false> <<<n_blocks,BLOCKDIM>>>( n_tile_grids, get_context(), in, in_queue_size,
stream, stream_begin );
// Fused mode: one kernel both loads from the stream and runs the units.
270 wq::mk_work_queue_kernel<BLOCKDIM,WorkUnit,WorkStream,true> <<<n_blocks,BLOCKDIM>>>( n_tile_grids, get_context(), in, in_queue_size,
stream, stream_begin );
273 stream_begin += to_load;
276 in_queue_size += to_load;
// Prefix-sum the 0/1 continuation flags so each surviving unit knows its
// compacted output slot. The scan call's name sits in a gap of this chunk --
// presumably thrust::inclusive_scan, given <thrust/scan.h> is included and the
// last element is read as the survivor count below; TODO confirm.
281 m_continuations.begin(),
282 m_continuations.begin() + in_queue_size,
283 m_continuations.begin() );
// NOTE(review): no cudaGetLastError() after the kernel launches above -- the
// synchronize here only surfaces execution errors, not launch-config errors;
// consider adding an explicit launch-error check.
286 cudaDeviceSynchronize();
// With an inclusive scan, the last scanned element is the total survivor count.
// (Reading a device_vector element performs a blocking device-to-host copy.)
288 const uint32 out_queue_size = m_continuations[ in_queue_size - 1 ];
// Compact the surviving units into the other queue half.
291 wq::mk_move_kernel<BLOCKDIM,WorkUnit,WorkStream,WorkMover> <<<n_blocks,BLOCKDIM>>>( n_tile_grids, get_context(), in, in_queue_size,
stream, mover );
// Swap input/output halves and carry the survivor count into the next pass.
295 std::swap( in_queue_begin, out_queue_begin );
297 in_queue_size = out_queue_size;