NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
primitives_inl.h
Go to the documentation of this file.
1 /*
2  * nvbio
3  * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #pragma once
29 
30 #include <nvbio/basic/numbers.h>
31 
32 namespace nvbio {
33 namespace cuda {
34 
35 // make sure a given buffer is big enough
36 //
37 template <typename VectorType>
38 void alloc_temp_storage(VectorType& vec, const uint64 size)
39 {
40  if (vec.size() < size)
41  {
42  try
43  {
44  vec.clear();
45  vec.resize( size );
46  }
47  catch (...)
48  {
49  log_error(stderr,"alloc_temp_storage() : allocation failed! (%llu entries / %llu bytes)\n", size, size * sizeof(typename VectorType::value_type));
50  throw;
51  }
52  }
53 }
54 
55 // any kernel
56 //
57 template <typename PredicateIterator>
58 __global__
60  const uint32 n,
61  const PredicateIterator pred,
62  uint32* r)
63 {
64  const uint32 i = threadIdx.x + blockIdx.x * blockDim.x;
65 
66  const bool p_i = (i < n ? pred[i] : false);
67  const bool p = __syncthreads_or( p_i );
68 
69  // TODO: this could be made faster by using persistent blocks, and early-exiting a
70  // block if this condition is true
71  if (p)
72  *r = 1u;
73 }
74 
75 // all kernel
76 //
77 template <typename PredicateIterator>
78 __global__
80  const uint32 n,
81  const PredicateIterator pred,
82  uint32* r)
83 {
84  const uint32 i = threadIdx.x + blockIdx.x * blockDim.x;
85 
86  const bool p_i = (i < n ? pred[i] : true);
87  const bool p = __syncthreads_and( p_i );
88 
89  // TODO: this could be made faster by using persistent blocks, and early-exiting a
90  // block if this condition is true
91  if (p == false)
92  *r = 0u;
93 }
94 
95 // return true if any item in the range [0,n) evaluates to true
96 //
97 template <typename PredicateIterator>
98 bool any(
99  const uint32 n,
100  const PredicateIterator pred)
101 {
102  const uint32 block_dim = 256;
103  const uint32 n_blocks = util::divide_ri( n, block_dim );
104 
105  thrust::device_vector<uint32> r( 1u, 0u );
106 
107  any_kernel<<<n_blocks,block_dim>>>( n, pred, nvbio::plain_view( r ) );
108  return r[0] != 0u;
109 }
110 
111 // return true if all items in the range [0,n) evaluate to true
112 //
113 template <typename PredicateIterator>
114 bool all(
115  const uint32 n,
116  const PredicateIterator pred)
117 {
118  const uint32 block_dim = 256;
119  const uint32 n_blocks = util::divide_ri( n, block_dim );
120 
121  thrust::device_vector<uint32> r( 1u, 1u );
122 
123  all_kernel<<<n_blocks,block_dim>>>( n, pred, nvbio::plain_view( r ) );
124  return r[0] != 0u;
125 }
126 
127 // a pseudo-iterator to evaluate the predicate (it1[i] <= it2[i]) for arbitrary iterator pairs
128 //
129 template <typename Iterator1, typename Iterator2>
131 {
132  // constructor
134  is_sorted_iterator(const Iterator1 _it1, const Iterator2 _it2) : it1( _it1 ), it2( _it2 ) {}
135 
136  // dereference operator
138  bool operator[] (const uint32 i) const { return it1[i] <= it2[i]; }
139 
140  const Iterator1 it1;
141  const Iterator2 it2;
142 };
143 
144 // a pseudo-iterator to evaluate the predicate (hd[i] || (it1[i] <= it2[i])) for arbitrary iterator pairs
145 //
146 template <typename Iterator1, typename Iterator2, typename Headflags>
148 {
149  // constructor
151  is_segment_sorted_iterator(const Iterator1 _it1, const Iterator2 _it2, const Headflags _hd) : it1( _it1 ), it2( _it2 ), hd(_hd) {}
152 
153  // dereference operator
155  bool operator[] (const uint32 i) const { return hd[i] || (it1[i] <= it2[i]); }
156 
157  const Iterator1 it1;
158  const Iterator2 it2;
159  const Headflags hd;
160 };
161 
162 // return true if the items in the range [0,n) are sorted
163 //
164 template <typename Iterator>
166  const uint32 n,
167  const Iterator values)
168 {
169  return all( n-1, is_sorted_iterator<Iterator,Iterator>( values, values+1 ) );
170 }
171 
172 // return true if the items in the range [0,n) are sorted by segment, where
173 // the beginning of each segment is identified by a set head flag
174 //
175 template <typename Iterator, typename Headflags>
177  const uint32 n,
178  const Iterator values,
179  const Headflags flags)
180 {
181  return all( n-1, is_segment_sorted_iterator<Iterator,Iterator,Headflags>( values, values+1, flags+1 ) );
182 }
183 
184 // device-wide reduce
185 //
186 // \param n number of items to reduce
187 // \param d_in a device iterator
188 // \param op the binary reduction operator
189 // \param d_temp_storage some temporary storage
190 //
191 template <typename InputIterator, typename BinaryOp>
192 typename std::iterator_traits<InputIterator>::value_type reduce(
193  const uint32 n,
194  InputIterator d_in,
195  BinaryOp op,
196  thrust::device_vector<uint8>& d_temp_storage)
197 {
198  typedef typename std::iterator_traits<InputIterator>::value_type value_type;
199 
200  thrust::device_vector<value_type> d_out(1);
201 
202  size_t temp_bytes = 0;
203 
204  cub::DeviceReduce::Reduce(
205  (void*)NULL, temp_bytes,
206  d_in,
207  d_out.begin(),
208  int(n),
209  op );
210 
211  temp_bytes = nvbio::max( uint64(temp_bytes), uint64(16) );
212  alloc_temp_storage( d_temp_storage, temp_bytes );
213 
214  cub::DeviceReduce::Reduce(
215  (void*)nvbio::plain_view( d_temp_storage ), temp_bytes,
216  d_in,
217  d_out.begin(),
218  int(n),
219  op );
220 
221  return d_out[0];
222 }
223 
224 // device-wide inclusive scan
225 //
226 // \param n number of items to reduce
227 // \param d_in a device input iterator
228 // \param d_out a device output iterator
229 // \param op the binary reduction operator
230 // \param d_temp_storage some temporary storage
231 //
232 template <typename InputIterator, typename OutputIterator, typename BinaryOp>
234  const uint32 n,
235  InputIterator d_in,
236  OutputIterator d_out,
237  BinaryOp op,
238  thrust::device_vector<uint8>& d_temp_storage)
239 {
240  size_t temp_bytes = 0;
241 
242  cub::DeviceScan::InclusiveScan(
243  (void*)NULL, temp_bytes,
244  d_in,
245  d_out,
246  op,
247  int(n) );
248 
249  temp_bytes = nvbio::max( uint64(temp_bytes), uint64(16) );
250  alloc_temp_storage( d_temp_storage, temp_bytes );
251 
252  cub::DeviceScan::InclusiveScan(
253  (void*)nvbio::plain_view( d_temp_storage ), temp_bytes,
254  d_in,
255  d_out,
256  op,
257  int(n) );
258 }
259 
260 // device-wide exclusive scan
261 //
262 // \param n number of items to reduce
263 // \param d_in a device input iterator
264 // \param d_out a device output iterator
265 // \param op the binary reduction operator
266 // \param identity the identity element
267 // \param d_temp_storage some temporary storage
268 //
269 template <typename InputIterator, typename OutputIterator, typename BinaryOp, typename Identity>
271  const uint32 n,
272  InputIterator d_in,
273  OutputIterator d_out,
274  BinaryOp op,
275  Identity identity,
276  thrust::device_vector<uint8>& d_temp_storage)
277 {
278  size_t temp_bytes = 0;
279 
280  cub::DeviceScan::ExclusiveScan(
281  (void*)NULL, temp_bytes,
282  d_in,
283  d_out,
284  op,
285  identity,
286  int(n) );
287 
288  temp_bytes = nvbio::max( uint64(temp_bytes), uint64(16) );
289  alloc_temp_storage( d_temp_storage, temp_bytes );
290 
291  cub::DeviceScan::ExclusiveScan(
292  (void*)nvbio::plain_view( d_temp_storage ), temp_bytes,
293  d_in,
294  d_out,
295  op,
296  identity,
297  int(n) );
298 }
299 
300 // device-wide copy of flagged items
301 //
302 // \param n number of input items
303 // \param d_in a device input iterator
304 // \param d_flags a device flags iterator
305 // \param d_out a device output iterator
306 // \param d_temp_storage some temporary storage
307 //
308 // \return the number of copied items
309 //
310 template <typename InputIterator, typename FlagsIterator, typename OutputIterator>
312  const uint32 n,
313  InputIterator d_in,
314  FlagsIterator d_flags,
315  OutputIterator d_out,
316  thrust::device_vector<uint8>& d_temp_storage)
317 {
318  size_t temp_bytes = 0;
319  thrust::device_vector<int> d_num_selected(1);
320 
321  cub::DeviceSelect::Flagged(
322  (void*)NULL, temp_bytes,
323  d_in,
324  d_flags,
325  d_out,
326  nvbio::plain_view( d_num_selected ),
327  int(n) );
328 
329  temp_bytes = nvbio::max( uint64(temp_bytes), uint64(16) );
330  alloc_temp_storage( d_temp_storage, temp_bytes );
331 
332  cub::DeviceSelect::Flagged(
333  (void*)nvbio::plain_view( d_temp_storage ), temp_bytes,
334  d_in,
335  d_flags,
336  d_out,
337  nvbio::plain_view( d_num_selected ),
338  int(n) );
339 
340  return uint32( d_num_selected[0] );
341 };
342 
343 // device-wide copy of predicated items
344 //
345 // \param n number of input items
346 // \param d_in a device input iterator
347 // \param d_out a device output iterator
348 // \param pred a unary predicate functor
349 // \param d_temp_storage some temporary storage
350 //
351 // \return the number of copied items
352 //
353 template <typename InputIterator, typename OutputIterator, typename Predicate>
355  const uint32 n,
356  InputIterator d_in,
357  OutputIterator d_out,
358  const Predicate pred,
359  thrust::device_vector<uint8>& d_temp_storage)
360 {
361  size_t temp_bytes = 0;
362  thrust::device_vector<int> d_num_selected(1);
363 
364  cub::DeviceSelect::If(
365  (void*)NULL, temp_bytes,
366  d_in,
367  d_out,
368  nvbio::plain_view( d_num_selected ),
369  int(n),
370  pred );
371 
372  temp_bytes = nvbio::max( uint64(temp_bytes), uint64(16) );
373  alloc_temp_storage( d_temp_storage, temp_bytes );
374 
375  cub::DeviceSelect::If(
376  (void*)nvbio::plain_view( d_temp_storage ), temp_bytes,
377  d_in,
378  d_out,
379  nvbio::plain_view( d_num_selected ),
380  int(n),
381  pred );
382 
383  return uint32( d_num_selected[0] );
384 };
385 
386 // device-wide run-length encode
387 //
388 // \param n number of input items
389 // \param d_in a device input iterator
390 // \param d_out a device output iterator
391 // \param d_counts a device output count iterator
392 // \param d_temp_storage some temporary storage
393 //
394 // \return the number of copied items
395 //
396 template <typename InputIterator, typename OutputIterator, typename CountIterator>
398  const uint32 n,
399  InputIterator d_in,
400  OutputIterator d_out,
401  CountIterator d_counts,
402  thrust::device_vector<uint8>& d_temp_storage)
403 {
404  size_t temp_bytes = 0;
405  thrust::device_vector<int> d_num_selected(1);
406 
407  cub::DeviceReduce::RunLengthEncode(
408  (void*)NULL, temp_bytes,
409  d_in,
410  d_out,
411  d_counts,
412  nvbio::plain_view( d_num_selected ),
413  int(n) );
414 
415  temp_bytes = nvbio::max( uint64(temp_bytes), uint64(16) );
416  alloc_temp_storage( d_temp_storage, temp_bytes );
417 
418  cub::DeviceReduce::RunLengthEncode(
419  (void*)nvbio::plain_view( d_temp_storage ), temp_bytes,
420  d_in,
421  d_out,
422  d_counts,
423  nvbio::plain_view( d_num_selected ),
424  int(n) );
425 
426  return uint32( d_num_selected[0] );
427 };
428 
429 
430 // device-wide run-length encode
431 //
432 // \param n number of input items
433 // \param d_keys_in a device input iterator
434 // \param d_values_in a device input iterator
435 // \param d_keys_out a device output iterator
436 // \param d_values_out a device output iterator
437 // \param reduction_op a reduction operator
438 // \param d_temp_storage some temporary storage
439 //
440 // \return the number of copied items
441 //
442 template <typename KeyIterator, typename ValueIterator, typename OutputKeyIterator, typename OutputValueIterator, typename ReductionOp>
444  const uint32 n,
445  KeyIterator d_keys_in,
446  ValueIterator d_values_in,
447  OutputKeyIterator d_keys_out,
448  OutputValueIterator d_values_out,
449  ReductionOp reduction_op,
450  thrust::device_vector<uint8>& d_temp_storage)
451 {
452  size_t temp_bytes = 0;
453  thrust::device_vector<int> d_num_selected(1);
454 
455  cub::DeviceReduce::ReduceByKey(
456  (void*)NULL, temp_bytes,
457  d_keys_in,
458  d_keys_out,
459  d_values_in,
460  d_values_out,
461  nvbio::plain_view( d_num_selected ),
462  reduction_op,
463  int(n) );
464 
465  temp_bytes = nvbio::max( uint64(temp_bytes), uint64(16) );
466  alloc_temp_storage( d_temp_storage, temp_bytes );
467 
468  cub::DeviceReduce::ReduceByKey(
469  (void*)nvbio::plain_view( d_temp_storage ), temp_bytes,
470  d_keys_in,
471  d_keys_out,
472  d_values_in,
473  d_values_out,
474  nvbio::plain_view( d_num_selected ),
475  reduction_op,
476  int(n) );
477 
478  return uint32( d_num_selected[0] );
479 }
480 
481 } // namespace cuda
482 } // namespace nvbio