33 #include <cub/cub.cuh>
34 #include <thrust/copy.h>
58 typedef WorkUnitT WorkUnit;
79 const uint32 work_base_id = sm_broadcast[ warp_id() ];
82 if (work_base_id >= stream_size)
88 if (work_id < stream_size)
92 stream.get( work_id, &unit, make_uint2( thread_id, 0u ) );
113 template <
typename WorkStream,
typename WorkMover>
126 wq::persistent_warps_work_queue_kernel<BLOCKDIM,WorkUnit,WorkStream> <<<n_blocks,
BLOCKDIM>>>( thrust::raw_pointer_cast( &m_pool.front() ), stream,
view( stats ) );
138 typename WorkStreamT>
144 typedef WorkUnitT WorkUnit;
165 while (__any(active))
168 const uint32 pop_mask = __ballot( work_id == invalid_unit );
169 const uint32 pop_count = __popc( pop_mask );
172 if (pop_count > max_inactive_lanes)
176 sm_broadcast[ warp_id() ] = atomicAdd( pool, pop_count );
179 const uint32 work_base_id = sm_broadcast[ warp_id() ];
182 if (work_id == invalid_unit)
188 work_id = pop_scan + work_base_id;
192 if (work_id < stream_size)
195 stream.get( work_id, &unit, make_uint2( thread_id, 0u ) );
206 if (work_id < stream_size)
212 if (unit.run( stream ) ==
false)
218 stats.sample_iterations( work_iter );
233 template <
typename WorkStream,
typename WorkMover>
246 const uint32 max_inactive_lanes = cuda::Arch::WARP_SIZE - min_active_lanes;
249 wq::persistent_threads_work_queue_kernel<BLOCKDIM,WorkUnit,WorkStream> <<<n_blocks,
BLOCKDIM>>>( thrust::raw_pointer_cast( &m_pool.front() ), max_inactive_lanes, stream,
view( stats ) );