Fermat
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Modules Pages
primitives_inl.h
1 /*
2  * CUGAR : Cuda Graphics Accelerator
3  *
4  * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions are met:
8  * * Redistributions of source code must retain the above copyright
9  * notice, this list of conditions and the following disclaimer.
10  * * Redistributions in binary form must reproduce the above copyright
11  * notice, this list of conditions and the following disclaimer in the
12  * documentation and/or other materials provided with the distribution.
13  * * Neither the name of the NVIDIA CORPORATION nor the
14  * names of its contributors may be used to endorse or promote products
15  * derived from this software without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
21  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 #pragma once
30 
31 #include <cugar/basic/numbers.h>
32 
33 namespace cugar {
34 namespace cuda {
35 
36 // make sure a given buffer is big enough
37 //
38 template <typename VectorType>
39 void alloc_temp_storage(VectorType& vec, const uint64 size)
40 {
41  if (vec.size() < size)
42  {
43  vec.clear();
44  vec.resize( size );
45  }
46 }
47 
48 // any kernel
49 //
50 template <typename PredicateIterator>
51 __global__
52 void any_kernel(
53  const uint32 n,
54  const PredicateIterator pred,
55  uint32* r)
56 {
57  const uint32 i = threadIdx.x + blockIdx.x * blockDim.x;
58 
59  const bool p_i = (i < n ? pred[i] : false);
60  const bool p = __syncthreads_or( p_i );
61 
62  // TODO: this could be made faster by using persistent blocks, and early-exiting a
63  // block if this condition is true
64  if (p)
65  *r = 1u;
66 }
67 
68 // all kernel
69 //
70 template <typename PredicateIterator>
71 __global__
72 void all_kernel(
73  const uint32 n,
74  const PredicateIterator pred,
75  uint32* r)
76 {
77  const uint32 i = threadIdx.x + blockIdx.x * blockDim.x;
78 
79  const bool p_i = (i < n ? pred[i] : true);
80  const bool p = __syncthreads_and( p_i );
81 
82  // TODO: this could be made faster by using persistent blocks, and early-exiting a
83  // block if this condition is true
84  if (p == false)
85  *r = 0u;
86 }
87 
88 // return true if any item in the range [0,n) evaluates to true
89 //
90 template <typename PredicateIterator>
91 bool any(
92  const uint32 n,
93  const PredicateIterator pred)
94 {
95  const uint32 block_dim = 256;
96  const uint32 n_blocks = divide_ri( n, block_dim );
97 
98  thrust::device_vector<uint32> r( 1u, 0u );
99 
100  any_kernel<<<n_blocks,block_dim>>>( n, pred, cugar::plain_view( r ) );
101  return r[0] != 0u;
102 }
103 
104 // return true if all items in the range [0,n) evaluate to true
105 //
106 template <typename PredicateIterator>
107 bool all(
108  const uint32 n,
109  const PredicateIterator pred)
110 {
111  const uint32 block_dim = 256;
112  const uint32 n_blocks = divide_ri( n, block_dim );
113 
114  thrust::device_vector<uint32> r( 1u, 1u );
115 
116  all_kernel<<<n_blocks,block_dim>>>( n, pred, cugar::plain_view( r ) );
117  return r[0] != 0u;
118 }
119 
120 // a pseudo-iterator to evaluate the predicate (it1[i] <= it2[i]) for arbitrary iterator pairs
121 //
122 template <typename Iterator1, typename Iterator2>
124 {
125  // constructor
126  CUGAR_FORCEINLINE CUGAR_HOST_DEVICE
127  is_sorted_iterator(const Iterator1 _it1, const Iterator2 _it2) : it1( _it1 ), it2( _it2 ) {}
128 
129  // dereference operator
130  CUGAR_FORCEINLINE CUGAR_HOST_DEVICE
131  bool operator[] (const uint32 i) const { return it1[i] <= it2[i]; }
132 
133  const Iterator1 it1;
134  const Iterator2 it2;
135 };
136 
137 // a pseudo-iterator to evaluate the predicate (hd[i] || (it1[i] <= it2[i])) for arbitrary iterator pairs
138 //
139 template <typename Iterator1, typename Iterator2, typename Headflags>
141 {
142  // constructor
143  CUGAR_FORCEINLINE CUGAR_HOST_DEVICE
144  is_segment_sorted_iterator(const Iterator1 _it1, const Iterator2 _it2, const Headflags _hd) : it1( _it1 ), it2( _it2 ), hd(_hd) {}
145 
146  // dereference operator
147  CUGAR_FORCEINLINE CUGAR_HOST_DEVICE
148  bool operator[] (const uint32 i) const { return hd[i] || (it1[i] <= it2[i]); }
149 
150  const Iterator1 it1;
151  const Iterator2 it2;
152  const Headflags hd;
153 };
154 
155 // return true if the items in the range [0,n) are sorted
156 //
157 template <typename Iterator>
159  const uint32 n,
160  const Iterator values)
161 {
162  return all( n-1, is_sorted_iterator<Iterator,Iterator>( values, values+1 ) );
163 }
164 
165 // return true if the items in the range [0,n) are sorted by segment, where
166 // the beginning of each segment is identified by a set head flag
167 //
168 template <typename Iterator, typename Headflags>
170  const uint32 n,
171  const Iterator values,
172  const Headflags flags)
173 {
174  return all( n-1, is_segment_sorted_iterator<Iterator,Iterator,Headflags>( values, values+1, flags+1 ) );
175 }
176 
177 // device-wide reduce
178 //
179 // \param n number of items to reduce
180 // \param d_in a device iterator
181 // \param op the binary reduction operator
182 // \param d_temp_storage some temporary storage
183 //
184 template <typename InputIterator, typename BinaryOp>
185 typename std::iterator_traits<InputIterator>::value_type reduce(
186  const uint32 n,
187  InputIterator d_in,
188  BinaryOp op,
189  thrust::device_vector<uint8>& d_temp_storage)
190 {
191  typedef typename std::iterator_traits<InputIterator>::value_type value_type;
192 
193  thrust::device_vector<value_type> d_out(1);
194 
195  size_t temp_bytes = 0;
196 
197  cub::DeviceReduce::Reduce(
198  (void*)NULL, temp_bytes,
199  d_in,
200  d_out.begin(),
201  int(n),
202  op,
203  value_type() );
204 
205  temp_bytes = cugar::max( uint64(temp_bytes), uint64(16) );
206  alloc_temp_storage( d_temp_storage, temp_bytes );
207 
208  cub::DeviceReduce::Reduce(
209  (void*)cugar::raw_pointer( d_temp_storage ), temp_bytes,
210  d_in,
211  d_out.begin(),
212  int(n),
213  op,
214  value_type() );
215 
216  return d_out[0];
217 }
218 
219 // device-wide inclusive scan
220 //
221 // \param n number of items to reduce
222 // \param d_in a device input iterator
223 // \param d_out a device output iterator
224 // \param op the binary reduction operator
225 // \param d_temp_storage some temporary storage
226 //
227 template <typename InputIterator, typename OutputIterator, typename BinaryOp>
229  const uint32 n,
230  InputIterator d_in,
231  OutputIterator d_out,
232  BinaryOp op,
233  thrust::device_vector<uint8>& d_temp_storage)
234 {
235  size_t temp_bytes = 0;
236 
237  cub::DeviceScan::InclusiveScan(
238  (void*)NULL, temp_bytes,
239  d_in,
240  d_out,
241  op,
242  int(n) );
243 
244  temp_bytes = cugar::max( uint64(temp_bytes), uint64(16) );
245  alloc_temp_storage( d_temp_storage, temp_bytes );
246 
247  cub::DeviceScan::InclusiveScan(
248  (void*)cugar::raw_pointer( d_temp_storage ), temp_bytes,
249  d_in,
250  d_out,
251  op,
252  int(n) );
253 }
254 
255 // device-wide exclusive scan
256 //
257 // \param n number of items to reduce
258 // \param d_in a device input iterator
259 // \param d_out a device output iterator
260 // \param op the binary reduction operator
261 // \param identity the identity element
262 // \param d_temp_storage some temporary storage
263 //
264 template <typename InputIterator, typename OutputIterator, typename BinaryOp, typename Identity>
266  const uint32 n,
267  InputIterator d_in,
268  OutputIterator d_out,
269  BinaryOp op,
270  Identity identity,
271  thrust::device_vector<uint8>& d_temp_storage)
272 {
273  size_t temp_bytes = 0;
274 
275  cub::DeviceScan::ExclusiveScan(
276  (void*)NULL, temp_bytes,
277  d_in,
278  d_out,
279  op,
280  identity,
281  int(n) );
282 
283  temp_bytes = cugar::max( uint64(temp_bytes), uint64(16) );
284  alloc_temp_storage( d_temp_storage, temp_bytes );
285 
286  cub::DeviceScan::ExclusiveScan(
287  (void*)cugar::raw_pointer( d_temp_storage ), temp_bytes,
288  d_in,
289  d_out,
290  op,
291  identity,
292  int(n) );
293 }
294 
295 // device-wide copy of flagged items
296 //
297 // \param n number of input items
298 // \param d_in a device input iterator
299 // \param d_flags a device flags iterator
300 // \param d_out a device output iterator
301 // \param d_temp_storage some temporary storage
302 //
303 // \return the number of copied items
304 //
305 template <typename InputIterator, typename FlagsIterator, typename OutputIterator>
307  const uint32 n,
308  InputIterator d_in,
309  FlagsIterator d_flags,
310  OutputIterator d_out,
311  thrust::device_vector<uint8>& d_temp_storage)
312 {
313  size_t temp_bytes = 0;
314  thrust::device_vector<int> d_num_selected(1);
315 
316  cub::DeviceSelect::Flagged(
317  (void*)NULL, temp_bytes,
318  d_in,
319  d_flags,
320  d_out,
321  cugar::raw_pointer( d_num_selected ),
322  int(n) );
323 
324  temp_bytes = cugar::max( uint64(temp_bytes), uint64(16) );
325  alloc_temp_storage( d_temp_storage, temp_bytes );
326 
327  cub::DeviceSelect::Flagged(
328  (void*)cugar::raw_pointer( d_temp_storage ), temp_bytes,
329  d_in,
330  d_flags,
331  d_out,
332  cugar::raw_pointer( d_num_selected ),
333  int(n) );
334 
335  return uint32( d_num_selected[0] );
336 };
337 
338 // device-wide copy of predicated items
339 //
340 // \param n number of input items
341 // \param d_in a device input iterator
342 // \param d_out a device output iterator
343 // \param pred a unary predicate functor
344 // \param d_temp_storage some temporary storage
345 //
346 // \return the number of copied items
347 //
348 template <typename InputIterator, typename OutputIterator, typename Predicate>
349 uint32 copy_if(
350  const uint32 n,
351  InputIterator d_in,
352  OutputIterator d_out,
353  const Predicate pred,
354  thrust::device_vector<uint8>& d_temp_storage)
355 {
356  size_t temp_bytes = 0;
357  thrust::device_vector<int> d_num_selected(1);
358 
359  cub::DeviceSelect::If(
360  (void*)NULL, temp_bytes,
361  d_in,
362  d_out,
363  cugar::raw_pointer( d_num_selected ),
364  int(n),
365  pred );
366 
367  temp_bytes = cugar::max( uint64(temp_bytes), uint64(16) );
368  alloc_temp_storage( d_temp_storage, temp_bytes );
369 
370  cub::DeviceSelect::If(
371  (void*)cugar::raw_pointer( d_temp_storage ), temp_bytes,
372  d_in,
373  d_out,
374  cugar::raw_pointer( d_num_selected ),
375  int(n),
376  pred );
377 
378  return uint32( d_num_selected[0] );
379 };
380 
381 // device-wide run-length encode
382 //
383 // \param n number of input items
384 // \param d_in a device input iterator
385 // \param d_out a device output iterator
386 // \param d_counts a device output count iterator
387 // \param d_temp_storage some temporary storage
388 //
389 // \return the number of copied items
390 //
391 template <typename InputIterator, typename OutputIterator, typename CountIterator>
393  const uint32 n,
394  InputIterator d_in,
395  OutputIterator d_out,
396  CountIterator d_counts,
397  thrust::device_vector<uint8>& d_temp_storage)
398 {
399  size_t temp_bytes = 0;
400  thrust::device_vector<int> d_num_selected(1);
401 
402  cub::DeviceRunLengthEncode::Encode(
403  (void*)NULL, temp_bytes,
404  d_in,
405  d_out,
406  d_counts,
407  cugar::raw_pointer( d_num_selected ),
408  int(n) );
409 
410  temp_bytes = cugar::max( uint64(temp_bytes), uint64(16) );
411  alloc_temp_storage( d_temp_storage, temp_bytes );
412 
413  cub::DeviceRunLengthEncode::Encode(
414  (void*)cugar::raw_pointer( d_temp_storage ), temp_bytes,
415  d_in,
416  d_out,
417  d_counts,
418  cugar::raw_pointer( d_num_selected ),
419  int(n) );
420 
421  return uint32( d_num_selected[0] );
422 };
423 
424 
425 // device-wide run-length encode
426 //
427 // \param n number of input items
428 // \param d_keys_in a device input iterator
429 // \param d_values_in a device input iterator
430 // \param d_keys_out a device output iterator
431 // \param d_values_out a device output iterator
432 // \param reduction_op a reduction operator
433 // \param d_temp_storage some temporary storage
434 //
435 // \return the number of copied items
436 //
437 template <typename KeyIterator, typename ValueIterator, typename OutputKeyIterator, typename OutputValueIterator, typename ReductionOp>
439  const uint32 n,
440  KeyIterator d_keys_in,
441  ValueIterator d_values_in,
442  OutputKeyIterator d_keys_out,
443  OutputValueIterator d_values_out,
444  ReductionOp reduction_op,
445  thrust::device_vector<uint8>& d_temp_storage)
446 {
447  size_t temp_bytes = 0;
448  thrust::device_vector<int> d_num_selected(1);
449 
450  cub::DeviceReduce::ReduceByKey(
451  (void*)NULL, temp_bytes,
452  d_keys_in,
453  d_keys_out,
454  d_values_in,
455  d_values_out,
456  cugar::raw_pointer( d_num_selected ),
457  reduction_op,
458  int(n) );
459 
460  temp_bytes = cugar::max( uint64(temp_bytes), uint64(16) );
461  alloc_temp_storage( d_temp_storage, temp_bytes );
462 
463  cub::DeviceReduce::ReduceByKey(
464  (void*)cugar::raw_pointer( d_temp_storage ), temp_bytes,
465  d_keys_in,
466  d_keys_out,
467  d_values_in,
468  d_values_out,
469  cugar::plain_view( d_num_selected ),
470  reduction_op,
471  int(n) );
472 
473  return uint32( d_num_selected[0] );
474 }
475 
476 } // namespace cuda
477 } // namespace cugar
Definition: primitives_inl.h:123
bool any(const uint32 n, const PredicateIterator pred)
Definition: primitives_inl.h:91
vector_view< T *, uint64 > plain_view(thrust::device_vector< T > &vec)
Definition: thrust_view.h:49
void alloc_temp_storage(VectorType &vec, const uint64 size)
Definition: primitives_inl.h:39
T * raw_pointer(thrust::device_vector< T, Alloc > &vec)
Definition: thrust_view.h:69
Definition: primitives_inl.h:140
bool is_sorted(const uint32 n, const Iterator values)
Definition: primitives_inl.h:234
uint32 runlength_encode(const uint32 n, InputIterator in, OutputIterator out, CountIterator counts, cugar::vector< system_tag, uint8 > &temp_storage)
Definition: primitives_inl.h:859
CUGAR_HOST_DEVICE L divide_ri(const L x, const R y)
Definition: numbers.h:180
bool all(const uint32 n, const PredicateIterator pred)
Definition: primitives_inl.h:101
bool is_segment_sorted(const uint32 n, const Iterator values, const Headflags flags)
Definition: primitives_inl.h:245
void exclusive_scan(const uint32 n, InputIterator in, OutputIterator out, BinaryOp op, Identity identity, cugar::vector< system_tag, uint8 > &temp_storage)
Definition: primitives_inl.h:569
uint32 reduce_by_key(const uint32 n, KeyIterator keys_in, ValueIterator values_in, OutputKeyIterator keys_out, OutputValueIterator values_out, ReductionOp reduction_op, cugar::vector< system_tag, uint8 > &temp_storage)
Definition: primitives_inl.h:882
Define a vector_view POD type and plain_view() for std::vector.
Definition: diff.h:38
uint32 copy_if(const uint32 n, InputIterator in, OutputIterator out, const Predicate pred, cugar::vector< system_tag, uint8 > &temp_storage)
Definition: primitives_inl.h:838
std::iterator_traits< InputIterator >::value_type reduce(const uint32 n, InputIterator in, BinaryOp op, cugar::vector< system_tag, uint8 > &temp_storage)
Definition: primitives_inl.h:520
void inclusive_scan(const uint32 n, InputIterator in, OutputIterator out, BinaryOp op, cugar::vector< system_tag, uint8 > &temp_storage)
Definition: primitives_inl.h:543
uint32 copy_flagged(const uint32 n, InputIterator in, FlagsIterator flags, OutputIterator out, cugar::vector< system_tag, uint8 > &temp_storage)
Definition: primitives_inl.h:817