NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
work_queue_inl.h
Go to the documentation of this file.
1 /*
2  * nvbio
3  * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #pragma once
29 
30 #include <nvbio/basic/types.h>
31 #include <nvbio/basic/numbers.h>
32 #include <nvbio/basic/cuda/arch.h>
33 #include <cub/cub.cuh>
34 #include <thrust/copy.h>
35 
36 namespace nvbio {
37 namespace cuda {
38 
39 namespace wq {
40 
43 
46 
// Kernel in which every thread pulls work units out of `stream` and runs each one
// to completion in place, recording statistics through `stats`.
//
// NOTE(review): this listing has lost original lines 48 and 52, i.e. the line that
// presumably declares the BLOCKDIM template parameter and the function signature
// itself. The kernel is referenced further down in this file as
// wq::inplace_work_queue_kernel<BLOCKDIM,WorkUnit,WorkStream> and is launched with
// the arguments (stream, view( stats )) — confirm the exact parameter types against
// the real header.
//
// Launch layout: the index math below uses BLOCKDIM rather than blockDim.x, so the
// kernel has to be launched with exactly BLOCKDIM threads per block (1D grid, 1D block).
47 template <
49  typename WorkUnitT,
50  typename WorkStreamT>
51 __global__
53 {
54  typedef WorkUnitT WorkUnit;
55 
 // total number of threads in the grid, and this thread's flat index within it
56  const uint32 grid_threads = gridDim.x * BLOCKDIM;
57  const uint32 thread_id = threadIdx.x + blockIdx.x*BLOCKDIM;
58 
59  // place a work-unit in local memory
60  WorkUnit unit;
61 
 // number of work units available in the stream; read once, before the loop
62  const uint32 stream_end = stream.size();
63 
64  // walk the stream in passes of grid_threads work ids; in each pass this thread takes the id at offset thread_id
65  for (uint32 stream_begin = 0; stream_begin < stream_end; stream_begin += grid_threads)
66  {
67  const uint32 work_id = thread_id + stream_begin;
68 
 // the last pass may be only partially filled: threads past the end of the stream skip it
69  if (work_id < stream_end)
70  {
71  // fetch the work unit
 // NOTE(review): the meaning of the uint2 argument (thread_id, 0) is defined by the
 // stream type and is not visible here — presumably a queue slot; confirm.
72  stream.get( work_id, &unit, make_uint2( thread_id, 0u ) );
73  stats.sample( STREAM_EVENT );
74 
75  // keep an iteration counter
76  uint32 work_iter = 0;
77 
78  // run the unit until completion
 // a RUN_EVENT is sampled and the counter bumped before every call to run(), so
 // work_iter ends up equal to the number of run() calls; the loop stops as soon as
 // run() returns false
79  do { stats.sample( RUN_EVENT ); ++work_iter; } while (unit.run( stream ));
80 
81  // sample the number of iterations this unit has been running
82  stats.sample_iterations( work_iter );
83  }
84  }
85 }
86 
88 
89 } // namespace wq
90 
91 // consume a stream of work units
92 //
// Host-side entry point: sizes a grid for wq::inplace_work_queue_kernel and launches
// it over `stream`, passing view( stats ) so the kernel can record statistics.
//
// NOTE(review): this listing has lost original lines 96 and 98, i.e. the last
// template parameter of the enclosing class (presumably BLOCKDIM) and the function
// signature. The names `stream`, `stats`, `WorkUnit` and `BLOCKDIM` used below are
// declared on those missing lines or in the class — confirm against the real header.
// WorkMover is not used anywhere in the visible body.
93 template <
94  typename PolicyTag,
95  typename WorkUnitT,
97 template <typename WorkStream, typename WorkMover>
99 {
100  // compute the number of blocks we are going to launch
 // queried for this exact kernel instantiation with BLOCKDIM threads per block and
 // 0 as the last argument (presumably bytes of dynamic shared memory — confirm
 // against cuda::max_active_blocks in nvbio/basic/cuda/arch.h)
101  const uint32 n_blocks = (uint32)cuda::max_active_blocks( wq::inplace_work_queue_kernel<BLOCKDIM,WorkUnit,WorkStream>, BLOCKDIM, 0u );
102 
103  // launch the consuming kernel
 // the launch configuration names only grid and block size, so no dynamic shared
 // memory is requested and the launch goes to the default stream
 // NOTE(review): the launch result is not checked here (no cudaGetLastError() or
 // equivalent follows in this function); a bad configuration would go unnoticed.
104  wq::inplace_work_queue_kernel<BLOCKDIM,WorkUnit,WorkStream> <<<n_blocks,BLOCKDIM>>>( stream, view( stats ) );
105 }
106 
107 
108 // sample utilization
109 //
// Records one utilization sample for the event `type`: how many lanes of the calling
// warp were executing this call, plus one "warp issued" tick.
//
// NOTE(review): this listing has lost original lines 110-111, i.e. the function
// signature. It is called above as stats.sample( STREAM_EVENT ) and
// stats.sample( RUN_EVENT ), so `type` is the event identifier — confirm its exact
// type against the real header.
112 {
 // nothing to record when the statistics object is not valid
113  if (valid() == false)
114  return;
115 
 // mask of the lanes that are executing this call together, and how many they are
 // NOTE(review): __ballot() is the legacy mask-less vote intrinsic; it is not
 // available when compiling for compute capability 7.0 and newer, where
 // __activemask() / __ballot_sync() are the replacements.
116  const uint32 active_mask = __ballot(true);
117  const uint32 active_count = __popc(active_mask);
 // elect a single lane to do the atomics: after shifting the mask right by this
 // lane's index, a population count of 1 means no higher-numbered lane is active,
 // i.e. this is the highest active lane (assumes warp_tid() returns the lane index
 // within the warp — confirm)
118  if (__popc(active_mask >> warp_tid()) == 1u)
119  {
 // one update per warp: add the active-lane count and one issued warp to the
 // counters for this event type
120  atomicAdd( active_lanes + type, active_count );
121  atomicAdd( issued_warps + type, 1u );
 // the ';' after the closing brace below is an empty statement and has no effect
122  };
123 }
124 
125 // sample iterations
126 //
// Folds one iteration count `i` into the running statistics kept in `iterations`:
// slot 0 accumulates the sum, slot 1 the maximum, slot 2 the number of samples.
//
// NOTE(review): this listing has lost original lines 127-128, i.e. the function
// signature. It is called above as stats.sample_iterations( work_iter ) with a
// uint32 argument — confirm the exact parameter type against the real header.
129 {
 // nothing to record when the statistics object is not valid
130  if (valid() == false)
131  return;
132 
 // every thread that gets here issues its own three atomics; there is no per-warp
 // aggregation in this function
133  atomicAdd( iterations, i ); // add to the total sum
 // NOTE(review): the explicit uint32* cast suggests the element type of
 // `iterations` does not match atomicMax's overloads directly — confirm.
134  atomicMax( (uint32*)(iterations+1u), i ); // take the maximum
135  atomicAdd( iterations+2u, 1u ); // increase the event counter
136 }
137 
139 
140 } // namespace cuda
141 } // namespace nvbio