Fermat
primitives_inl.h
1 /*
2  * cugar
3  * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #pragma once
29 
30 namespace cugar {
31 
32 // return true if any item in the range [0,n) evaluates to true
33 //
34 template <typename PredicateIterator>
35 bool any(
36  const host_tag tag,
37  const uint32 n,
38  const PredicateIterator pred)
39 {
40  return thrust::reduce(
41  pred,
42  pred + n,
43  false,
44  thrust::logical_or<bool>() );
45 }
46 
47 // return true if all items in the range [0,n) evaluate to true
48 //
49 template <typename PredicateIterator>
50 bool all(
51  const host_tag tag,
52  const uint32 n,
53  const PredicateIterator pred)
54 {
55  return thrust::reduce(
56  pred,
57  pred + n,
58  true,
59  thrust::logical_and<bool>() );
60 }
61 
62 #if defined(__CUDACC__)
63 
64 // return true if any item in the range [0,n) evaluates to true
65 //
66 template <typename PredicateIterator>
67 bool any(
68  const device_tag tag,
69  const uint32 n,
70  const PredicateIterator pred)
71 {
72  return cuda::any( n, pred );
73 }
74 
75 // return true if any item in the range [0,n) evaluates to true
76 //
77 template <typename PredicateIterator>
78 bool all(
79  const device_tag tag,
80  const uint32 n,
81  const PredicateIterator pred)
82 {
83  return cuda::all( n, pred );
84 }
85 
86 #endif
87 
88 // return true if any item in the range [0,n) evaluates to true
89 //
90 template <typename system_tag, typename PredicateIterator>
91 bool any(
92  const uint32 n,
93  const PredicateIterator pred)
94 {
95  return any( system_tag(), n, pred );
96 }
97 
98 // return true if all items in the range [0,n) evaluate to true
99 //
100 template <typename system_tag, typename PredicateIterator>
101 bool all(
102  const uint32 n,
103  const PredicateIterator pred)
104 {
105  return all( system_tag(), n, pred );
106 }
107 
108 // a pseudo-iterator to evaluate the predicate (it1[i] <= it2[i]) for arbitrary iterator pairs
109 //
110 template <typename Iterator1, typename Iterator2>
112 {
113  typedef bool value_type;
114  typedef value_type& reference;
115  typedef value_type const_reference;
116  typedef value_type* pointer;
117  typedef typename std::iterator_traits<Iterator1>::difference_type difference_type;
118  typedef typename std::iterator_traits<Iterator1>::iterator_category iterator_category;
119 
120  // constructor
121  CUGAR_FORCEINLINE CUGAR_HOST_DEVICE
122  is_sorted_iterator(const Iterator1 _it1, const Iterator2 _it2) : it1( _it1 ), it2( _it2 ) {}
123 
124  // dereference operator
125  CUGAR_FORCEINLINE CUGAR_HOST_DEVICE
126  bool operator[] (const uint64 i) const { return it1[i] <= it2[i]; }
127 
128  // dereference operator
129  CUGAR_FORCEINLINE CUGAR_HOST_DEVICE
130  bool operator* () const { return it1[0] <= it2[0]; }
131 
132  // dereference operator
133  CUGAR_FORCEINLINE CUGAR_HOST_DEVICE
134  is_sorted_iterator& operator++ () { ++it1; ++it2; return *this; }
135 
136  Iterator1 it1;
137  Iterator2 it2;
138 };
139 
140 // operator+
141 template <typename T1, typename T2>
142 CUGAR_FORCEINLINE CUGAR_HOST_DEVICE
143 is_sorted_iterator<T1,T2> operator+ (const is_sorted_iterator<T1,T2> it, const int64 i)
144 {
145  return is_sorted_iterator<T1,T2>( it.it1 + i, it.it2 + i );
146 }
147 // operator-
148 template <typename T1, typename T2>
149 CUGAR_FORCEINLINE CUGAR_HOST_DEVICE
150 int64 operator- (const is_sorted_iterator<T1,T2> it1, const is_sorted_iterator<T1,T2> it2)
151 {
152  return it1.it1 - it2.it1;
153 }
154 // operator!=
155 template <typename T1, typename T2>
156 CUGAR_FORCEINLINE CUGAR_HOST_DEVICE
157 bool operator!= (const is_sorted_iterator<T1,T2> it1, const is_sorted_iterator<T1,T2> it2)
158 {
159  return it1.it1 != it2.it1;
160 }
161 // operator==
162 template <typename T1, typename T2>
163 CUGAR_FORCEINLINE CUGAR_HOST_DEVICE
164 bool operator== (const is_sorted_iterator<T1,T2> it1, const is_sorted_iterator<T1,T2> it2)
165 {
166  return it1.it1 == it2.it1;
167 }
168 
169 // a pseudo-iterator to evaluate the predicate (hd[i] || (it1[i] <= it2[i])) for arbitrary iterator pairs
170 //
171 template <typename Iterator1, typename Iterator2, typename Headflags>
173 {
174  typedef bool value_type;
175  typedef value_type& reference;
176  typedef value_type const_reference;
177  typedef value_type* pointer;
178  typedef typename std::iterator_traits<Iterator1>::difference_type difference_type;
179  typedef typename std::iterator_traits<Iterator1>::iterator_category iterator_category;
180 
181  // constructor
182  CUGAR_FORCEINLINE CUGAR_HOST_DEVICE
183  is_segment_sorted_iterator(const Iterator1 _it1, const Iterator2 _it2, const Headflags _hd) : it1( _it1 ), it2( _it2 ), hd(_hd) {}
184 
185  // dereference operator
186  CUGAR_FORCEINLINE CUGAR_HOST_DEVICE
187  bool operator[] (const uint64 i) const { return hd[i] || (it1[i] <= it2[i]); }
188 
189  // dereference operator
190  CUGAR_FORCEINLINE CUGAR_HOST_DEVICE
191  bool operator* () const { return hd[0] || (it1[0] <= it2[0]); }
192 
193  // dereference operator
194  CUGAR_FORCEINLINE CUGAR_HOST_DEVICE
195  is_segment_sorted_iterator& operator++ () { ++it1; ++it2; ++hd; return *this; }
196 
197  Iterator1 it1;
198  Iterator2 it2;
199  Headflags hd;
200 };
201 
202 // operator+
203 template <typename T1, typename T2, typename H>
204 CUGAR_FORCEINLINE CUGAR_HOST_DEVICE
206 {
207  return is_segment_sorted_iterator<T1,T2,H>( it.it1 + i, it.it2 + i, it.hd + i );
208 }
209 // operator-
210 template <typename T1, typename T2, typename H>
211 CUGAR_FORCEINLINE CUGAR_HOST_DEVICE
213 {
214  return it1.it1 - it2.it1;
215 }
216 // operator!=
217 template <typename T1, typename T2, typename H>
218 CUGAR_FORCEINLINE CUGAR_HOST_DEVICE
220 {
221  return it1.it1 != it2.it1;
222 }
223 // operator==
224 template <typename T1, typename T2, typename H>
225 CUGAR_FORCEINLINE CUGAR_HOST_DEVICE
227 {
228  return it1.it1 == it2.it1;
229 }
230 
231 // return true if the items in the range [0,n) are sorted
232 //
233 template <typename system_tag, typename Iterator>
235  const uint32 n,
236  const Iterator values)
237 {
238  return all<system_tag>( n-1, is_sorted_iterator<Iterator,Iterator>( values, values+1 ) );
239 }
240 
241 // return true if the items in the range [0,n) are sorted by segment, where
242 // the beginning of each segment is identified by a set head flag
243 //
244 template <typename system_tag, typename Iterator, typename Headflags>
246  const uint32 n,
247  const Iterator values,
248  const Headflags flags)
249 {
250  return all<system_tag>( n-1, is_segment_sorted_iterator<Iterator,Iterator,Headflags>( values, values+1, flags+1 ) );
251 }
252 
253 // invoke a functor for each element of the given sequence
254 //
255 template <typename Iterator, typename Functor>
256 void for_each(
257  const host_tag tag,
258  const uint64 n,
259  const Iterator in,
260  Functor functor)
261 {
262  #if defined(_OPENMP)
263  #pragma omp parallel for if (n >= 256)
264  #endif
265  for (int64 i = 0; i < int64(n); ++i)
266  functor( in[i] );
267 }
268 
269 // invoke a functor for each element of the given sequence
270 //
271 template <typename Iterator, typename Functor>
272 void for_each(
273  const device_tag tag,
274  const uint64 n,
275  const Iterator in,
276  Functor functor)
277 {
278  thrust::for_each( in, in + n, functor );
279 }
280 
281 // invoke a functor for each element of the given sequence
282 //
283 template <typename system_tag, typename Iterator, typename Functor>
284 void for_each(
285  const uint64 n,
286  const Iterator in,
287  Functor functor)
288 {
289  return for_each( system_tag(), n, in, functor );
290 }
291 
292 // apply a functor to each element of the given sequence
293 //
294 template <typename Iterator, typename Output, typename Functor>
295 void transform(
296  const device_tag tag,
297  const uint64 n,
298  const Iterator in,
299  const Output out,
300  const Functor functor)
301 {
302  thrust::transform( in, in + n, out, functor );
303 }
304 
305 // apply a functor to each element of the given sequence
306 //
307 template <typename Iterator, typename Output, typename Functor>
308 void transform(
309  const host_tag tag,
310  const uint32 n,
311  const Iterator in,
312  const Output out,
313  const Functor functor)
314 {
315  #if defined(_OPENMP)
316  #pragma omp parallel for if (n >= 256)
317  #endif
318  for (int64 i = 0; i < int64(n); ++i)
319  out[i] = functor( in[i] );
320 }
321 
322 // apply a binary functor to each pair of elements of the given sequences
323 //
324 template <typename Iterator1, typename Iterator2, typename Output, typename Functor>
325 void transform(
326  const device_tag tag,
327  const uint32 n,
328  const Iterator1 in1,
329  const Iterator2 in2,
330  const Output out,
331  const Functor functor)
332 {
333  thrust::transform( in1, in1 + n, in2, out, functor );
334 }
335 
336 // apply a binary functor to each pair of elements of the given sequences
337 //
338 template <typename Iterator1, typename Iterator2, typename Output, typename Functor>
339 void transform(
340  const host_tag tag,
341  const uint32 n,
342  const Iterator1 in1,
343  const Iterator2 in2,
344  const Output out,
345  const Functor functor)
346 {
347  #if defined(_OPENMP)
348  #pragma omp parallel for if (n >= 256)
349  #endif
350  for (int64 i = 0; i < int64(n); ++i)
351  out[i] = functor( in1[i], in2[i] );
352 }
353 
354 // apply a functor to each element of the given sequence
355 //
356 template <typename system_tag, typename Iterator, typename Output, typename Functor>
358  const uint32 n,
359  const Iterator in,
360  const Output out,
361  const Functor functor)
362 {
363  transform( system_tag(), n, in, out, functor );
364 }
365 
366 // apply a binary functor to each pair of elements of the given sequences
367 //
368 template <typename system_tag, typename Iterator1, typename Iterator2, typename Output, typename Functor>
370  const uint32 n,
371  const Iterator1 in1,
372  const Iterator2 in2,
373  const Output out,
374  const Functor functor)
375 {
376  transform( system_tag(), n, in1, in2, out, functor );
377 }
378 
379 // host-wide reduce
380 //
381 // \param n number of items to reduce
382 // \param in a system iterator
383 // \param op the binary reduction operator
384 // \param temp_storage some temporary storage
385 //
386 template <typename InputIterator, typename BinaryOp>
387 typename std::iterator_traits<InputIterator>::value_type reduce(
388  host_tag tag,
389  const uint32 n,
390  InputIterator in,
391  BinaryOp op,
392  cugar::vector<host_tag,uint8>& temp_storage)
393 {
394  return thrust::reduce( in, in + n, 0u, op );
395 }
396 
397 // host-wide inclusive scan
398 //
399 // \param n number of items to reduce
400 // \param in a device input iterator
401 // \param out a device output iterator
402 // \param op the binary reduction operator
403 // \param temp_storage some temporary storage
404 //
405 template <typename InputIterator, typename OutputIterator, typename BinaryOp>
406 void inclusive_scan(
407  host_tag tag,
408  const uint32 n,
409  InputIterator in,
410  OutputIterator out,
411  BinaryOp op,
412  cugar::vector<host_tag,uint8>& temp_storage)
413 {
415  in,
416  in + n,
417  out,
418  op );
419 }
420 
421 // host-wide exclusive scan
422 //
423 // \param n number of items to reduce
424 // \param in a device input iterator
425 // \param out a device output iterator
426 // \param op the binary reduction operator
427 // \param identity the identity element
428 // \param temp_storage some temporary storage
429 //
430 template <typename InputIterator, typename OutputIterator, typename BinaryOp, typename Identity>
431 void exclusive_scan(
432  host_tag tag,
433  const uint32 n,
434  InputIterator in,
435  OutputIterator out,
436  BinaryOp op,
437  Identity identity,
438  cugar::vector<host_tag,uint8>& temp_storage)
439 {
441  in,
442  in + n,
443  out,
444  identity,
445  op );
446 }
447 
448 #if defined(__CUDACC__)
449 
450 // system-wide reduce
451 //
452 // \param n number of items to reduce
453 // \param in a system iterator
454 // \param op the binary reduction operator
455 // \param temp_storage some temporary storage
456 //
457 template <typename InputIterator, typename BinaryOp>
458 typename std::iterator_traits<InputIterator>::value_type reduce(
459  device_tag tag,
460  const uint32 n,
461  InputIterator in,
462  BinaryOp op,
463  cugar::vector<device_tag,uint8>& temp_storage)
464 {
465  return cuda::reduce( n, in, op, temp_storage );
466 }
467 
468 // device-wide inclusive scan
469 //
470 // \param n number of items to reduce
471 // \param in a device input iterator
472 // \param out a device output iterator
473 // \param op the binary reduction operator
474 // \param temp_storage some temporary storage
475 //
476 template <typename InputIterator, typename OutputIterator, typename BinaryOp>
477 void inclusive_scan(
478  device_tag tag,
479  const uint32 n,
480  InputIterator in,
481  OutputIterator out,
482  BinaryOp op,
483  cugar::vector<device_tag,uint8>& temp_storage)
484 {
485  cuda::inclusive_scan( n, in, out, op, temp_storage );
486 }
487 
488 // device-wide exclusive scan
489 //
490 // \param n number of items to reduce
491 // \param in a device input iterator
492 // \param out a device output iterator
493 // \param op the binary reduction operator
494 // \param identity the identity element
495 // \param temp_storage some temporary storage
496 //
497 template <typename InputIterator, typename OutputIterator, typename BinaryOp, typename Identity>
498 void exclusive_scan(
499  device_tag tag,
500  const uint32 n,
501  InputIterator in,
502  OutputIterator out,
503  BinaryOp op,
504  Identity identity,
505  cugar::vector<device_tag,uint8>& temp_storage)
506 {
507  cuda::exclusive_scan( n, in, out, op, identity, temp_storage );
508 }
509 
510 #endif
511 
512 // system-wide reduce
513 //
514 // \param n number of items to reduce
515 // \param in a system iterator
516 // \param op the binary reduction operator
517 // \param temp_storage some temporary storage
518 //
519 template <typename system_tag, typename InputIterator, typename BinaryOp>
520 typename std::iterator_traits<InputIterator>::value_type reduce(
521  const uint32 n,
522  InputIterator in,
523  BinaryOp op,
524  cugar::vector<system_tag,uint8>& temp_storage)
525 {
526  return reduce(
527  system_tag(),
528  n,
529  in,
530  op,
531  temp_storage );
532 }
533 
534 // device-wide inclusive scan
535 //
536 // \param n number of items to reduce
537 // \param in a device input iterator
538 // \param out a device output iterator
539 // \param op the binary reduction operator
540 // \param temp_storage some temporary storage
541 //
542 template <typename system_tag, typename InputIterator, typename OutputIterator, typename BinaryOp>
544  const uint32 n,
545  InputIterator in,
546  OutputIterator out,
547  BinaryOp op,
548  cugar::vector<system_tag,uint8>& temp_storage)
549 {
551  system_tag(),
552  n,
553  in,
554  out,
555  op,
556  temp_storage );
557 }
558 
559 // device-wide exclusive scan
560 //
561 // \param n number of items to reduce
562 // \param in a device input iterator
563 // \param out a device output iterator
564 // \param op the binary reduction operator
565 // \param identity the identity element
566 // \param temp_storage some temporary storage
567 //
568 template <typename system_tag, typename InputIterator, typename OutputIterator, typename BinaryOp, typename Identity>
570  const uint32 n,
571  InputIterator in,
572  OutputIterator out,
573  BinaryOp op,
574  Identity identity,
575  cugar::vector<system_tag,uint8>& temp_storage)
576 {
578  system_tag(),
579  n,
580  in,
581  out,
582  op,
583  identity,
584  temp_storage );
585 }
586 
587 // host-wide copy of flagged items
588 //
589 // \param n number of input items
590 // \param in a input iterator
591 // \param flags a flags iterator
592 // \param out a output iterator
593 // \param temp_storage some temporary storage
594 //
595 // \return the number of copied items
596 //
597 template <typename InputIterator, typename FlagsIterator, typename OutputIterator>
598 uint32 copy_flagged(
599  const host_tag tag,
600  const uint32 n,
601  InputIterator in,
602  FlagsIterator flags,
603  OutputIterator out,
604  cugar::vector<host_tag,uint8>& temp_storage)
605 {
606  return uint32( thrust::copy_if(
607  in,
608  in + n,
609  flags,
610  out,
611  cugar::is_true_functor<bool>() ) - out );
612 }
613 
614 // host-wide copy of predicated items
615 //
616 // \param n number of input items
617 // \param in a input iterator
618 // \param flags a flags iterator
619 // \param out a output iterator
620 // \param temp_storage some temporary storage
621 //
622 // \return the number of copied items
623 //
624 template <typename InputIterator, typename OutputIterator, typename Predicate>
625 uint32 copy_if(
626  const host_tag tag,
627  const uint32 n,
628  InputIterator in,
629  OutputIterator out,
630  const Predicate pred,
631  cugar::vector<host_tag,uint8>& temp_storage)
632 {
633  return uint32( thrust::copy_if(
634  in,
635  in + n,
636  out,
637  pred ) - out );
638 }
639 
640 // system-wide run-length encode
641 //
642 // \param n number of input items
643 // \param in a system input iterator
644 // \param out a system output iterator
645 // \param counts a system output count iterator
646 // \param temp_storage some temporary storage
647 //
648 // \return the number of copied items
649 //
650 template <typename InputIterator, typename OutputIterator, typename CountIterator>
651 uint32 runlength_encode(
652  const host_tag tag,
653  const uint32 n,
654  InputIterator in,
655  OutputIterator out,
656  CountIterator counts,
657  cugar::vector<host_tag,uint8>& temp_storage)
658 {
659  return uint32( thrust::reduce_by_key(
660  in,
661  in + n,
662  thrust::make_constant_iterator<uint32>( 1u ),
663  out,
664  counts ).first - out );
665 };
666 
667 
668 // system-wide run-length encode
669 //
670 // \param n number of input items
671 // \param keys_in a system input iterator
672 // \param values_in a system input iterator
673 // \param keys_out a system output iterator
674 // \param values_out a system output iterator
675 // \param reduction_op a reduction operator
676 // \param temp_storage some temporary storage
677 //
678 // \return the number of copied items
679 //
680 template <typename KeyIterator, typename ValueIterator, typename OutputKeyIterator, typename OutputValueIterator, typename ReductionOp>
681 uint32 reduce_by_key(
682  const host_tag tag,
683  const uint32 n,
684  KeyIterator keys_in,
685  ValueIterator values_in,
686  OutputKeyIterator keys_out,
687  OutputValueIterator values_out,
688  ReductionOp reduction_op,
689  cugar::vector<host_tag,uint8>& temp_storage)
690 {
691  typedef typename std::iterator_traits<KeyIterator>::value_type key_type;
692 
693  return uint32( thrust::reduce_by_key(
694  keys_in,
695  keys_in + n,
696  values_in,
697  keys_out,
698  values_out,
700  reduction_op ).first - keys_out );
701 }
702 
703 #if defined(__CUDACC__)
704 
705 // device-wide copy of flagged items
706 //
707 // \param n number of input items
708 // \param in a input iterator
709 // \param flags a flags iterator
710 // \param out a output iterator
711 // \param temp_storage some temporary storage
712 //
713 // \return the number of copied items
714 //
715 template <typename InputIterator, typename FlagsIterator, typename OutputIterator>
716 uint32 copy_flagged(
717  const device_tag tag,
718  const uint32 n,
719  InputIterator in,
720  FlagsIterator flags,
721  OutputIterator out,
722  cugar::vector<device_tag,uint8>& temp_storage)
723 {
724  return cuda::copy_flagged( n, in, flags, out, temp_storage );
725 }
726 
727 // device-wide copy of predicated items
728 //
729 // \param n number of input items
730 // \param in a input iterator
731 // \param flags a flags iterator
732 // \param out a output iterator
733 // \param temp_storage some temporary storage
734 //
735 // \return the number of copied items
736 //
737 template <typename InputIterator, typename OutputIterator, typename Predicate>
738 uint32 copy_if(
739  const device_tag tag,
740  const uint32 n,
741  InputIterator in,
742  OutputIterator out,
743  const Predicate pred,
744  cugar::vector<device_tag,uint8>& temp_storage)
745 {
746  return cuda::copy_if( n, in, out, pred, temp_storage );
747 }
748 
749 // system-wide run-length encode
750 //
751 // \param n number of input items
752 // \param in a device input iterator
753 // \param out a device output iterator
754 // \param counts a device output count iterator
755 // \param temp_storage some temporary storage
756 //
757 // \return the number of copied items
758 //
759 template <typename InputIterator, typename OutputIterator, typename CountIterator>
760 uint32 runlength_encode(
761  const device_tag tag,
762  const uint32 n,
763  InputIterator in,
764  OutputIterator out,
765  CountIterator counts,
766  cugar::vector<device_tag,uint8>& temp_storage)
767 {
768  return cuda::runlength_encode( n, in, out, counts, temp_storage );
769 };
770 
771 // device-wide run-length encode
772 //
773 // \param n number of input items
774 // \param keys_in a device input iterator
775 // \param values_in a device input iterator
776 // \param keys_out a device output iterator
777 // \param values_out a device output iterator
778 // \param reduction_op a reduction operator
779 // \param temp_storage some temporary storage
780 //
781 // \return the number of copied items
782 //
783 template <typename KeyIterator, typename ValueIterator, typename OutputKeyIterator, typename OutputValueIterator, typename ReductionOp>
784 uint32 reduce_by_key(
785  const device_tag tag,
786  const uint32 n,
787  KeyIterator keys_in,
788  ValueIterator values_in,
789  OutputKeyIterator keys_out,
790  OutputValueIterator values_out,
791  ReductionOp reduction_op,
792  cugar::vector<device_tag,uint8>& temp_storage)
793 {
794  return cuda::reduce_by_key(
795  n,
796  keys_in,
797  values_in,
798  keys_out,
799  values_out,
800  reduction_op,
801  temp_storage );
802 }
803 
804 #endif
805 
806 // system-wide copy of flagged items
807 //
808 // \param n number of input items
809 // \param in a device input iterator
810 // \param flags a device flags iterator
811 // \param out a device output iterator
812 // \param temp_storage some temporary storage
813 //
814 // \return the number of copied items
815 //
816 template <typename system_tag, typename InputIterator, typename FlagsIterator, typename OutputIterator>
818  const uint32 n,
819  InputIterator in,
820  FlagsIterator flags,
821  OutputIterator out,
822  cugar::vector<system_tag,uint8>& temp_storage)
823 {
824  return copy_flagged( system_tag(), n, in, flags, out, temp_storage );
825 };
826 
827 // system-wide copy of predicated items
828 //
829 // \param n number of input items
830 // \param in a device input iterator
831 // \param out a device output iterator
832 // \param pred a unary predicate functor
833 // \param temp_storage some temporary storage
834 //
835 // \return the number of copied items
836 //
837 template <typename system_tag, typename InputIterator, typename OutputIterator, typename Predicate>
838 uint32 copy_if(
839  const uint32 n,
840  InputIterator in,
841  OutputIterator out,
842  const Predicate pred,
843  cugar::vector<system_tag,uint8>& temp_storage)
844 {
845  return copy_if( system_tag(), n, in, out, pred, temp_storage );
846 };
847 
848 // system-wide run-length encode
849 //
850 // \param n number of input items
851 // \param in a system input iterator
852 // \param out a system output iterator
853 // \param counts a system output count iterator
854 // \param temp_storage some temporary storage
855 //
856 // \return the number of copied items
857 //
858 template <typename system_tag, typename InputIterator, typename OutputIterator, typename CountIterator>
860  const uint32 n,
861  InputIterator in,
862  OutputIterator out,
863  CountIterator counts,
864  cugar::vector<system_tag,uint8>& temp_storage)
865 {
866  return runlength_encode( system_tag(), n, in, out, counts, temp_storage );
867 };
868 
869 // system-wide run-length encode
870 //
871 // \param n number of input items
872 // \param keys_in a system input iterator
873 // \param values_in a system input iterator
874 // \param keys_out a system output iterator
875 // \param values_out a system output iterator
876 // \param reduction_op a reduction operator
877 // \param temp_storage some temporary storage
878 //
879 // \return the number of copied items
880 //
881 template <typename system_tag, typename KeyIterator, typename ValueIterator, typename OutputKeyIterator, typename OutputValueIterator, typename ReductionOp>
883  const uint32 n,
884  KeyIterator keys_in,
885  ValueIterator values_in,
886  OutputKeyIterator keys_out,
887  OutputValueIterator values_out,
888  ReductionOp reduction_op,
889  cugar::vector<system_tag,uint8>& temp_storage)
890 {
891  return reduce_by_key(
892  system_tag(),
893  n,
894  keys_in,
895  values_in,
896  keys_out,
897  values_out,
898  reduction_op,
899  temp_storage );
900 }
901 
902 #if defined(__CUDACC__)
903 // lower_bound kernel
904 //
905 template <typename KeyIterator, typename ValueIterator, typename OutputIterator>
906 __global__
907 void lower_bound_kernel(
908  const uint32 n,
909  ValueIterator values,
910  const uint32 n_keys,
911  KeyIterator keys,
912  OutputIterator indices)
913 {
914  const uint32 i = threadIdx.x + blockIdx.x * blockDim.x;
915 
916  if (i < n)
917  indices[i] = lower_bound_index( values[i], keys, n_keys );
918 }
919 // lower_bound kernel
920 //
921 template <typename KeyIterator, typename ValueIterator, typename OutputIterator>
922 __global__
923 void upper_bound_kernel(
924  const uint32 n,
925  ValueIterator values,
926  const uint32 n_keys,
927  KeyIterator keys,
928  OutputIterator indices)
929 {
930  const uint32 i = threadIdx.x + blockIdx.x * blockDim.x;
931 
932  if (i < n)
933  indices[i] = upper_bound_index( values[i], keys, n_keys );
934 }
935 #endif
936 // device-wide lower_bound
937 //
938 // \param n number of input items
939 // \param values a system input iterator of values to be searched
940 // \param keys a system input iterator of sorted keys
941 // \param indices a system output iterator
942 //
943 template <typename KeyIterator, typename ValueIterator, typename OutputIterator>
944 void lower_bound(
945  const device_tag tag,
946  const uint32 n,
947  ValueIterator values,
948  const uint32 n_keys,
949  KeyIterator keys,
950  OutputIterator indices)
951 {
952  #if 1
954  keys, keys + n_keys,
955  values, values + n,
956  indices );
957  #elif defined(__CUDACC__)
958  const uint32 blockdim = 128;
959  const uint32 n_blocks = divide_ri( n, blockdim );
960 
961  lower_bound_kernel<<<n_blocks,blockdim>>>( n, values, n_keys, keys, indices );
962  #endif
963 }
964 
965 // host-wide lower_bound
966 //
967 // \param n number of input items
968 // \param values a system input iterator of values to be searched
969 // \param keys a system input iterator of sorted keys
970 // \param indices a system output iterator
971 //
972 template <typename KeyIterator, typename ValueIterator, typename OutputIterator>
973 void lower_bound(
974  const host_tag tag,
975  const uint32 n,
976  ValueIterator values,
977  const uint32 n_keys,
978  KeyIterator keys,
979  OutputIterator indices)
980 {
981  #pragma omp parallel for
982  for (long i = 0; i < long(n); ++i)
983  indices[i] = uint32( lower_bound( values[i], keys, n_keys ) - keys );
984 }
985 
986 // system-wide lower_bound
987 //
988 // \param n number of input items
989 // \param values a system input iterator of values to be searched
990 // \param keys a system input iterator of sorted keys
991 // \param indices a system output iterator
992 //
993 template <typename system_tag, typename KeyIterator, typename ValueIterator, typename OutputIterator>
995  const uint32 n,
996  ValueIterator values,
997  const uint32 n_keys,
998  KeyIterator keys,
999  OutputIterator indices)
1000 {
1001  lower_bound(
1002  system_tag(),
1003  n,
1004  values,
1005  n_keys,
1006  keys,
1007  indices );
1008 }
1009 
1010 // device-wide upper_bound
1011 //
1012 // \param n number of input items
1013 // \param values a system input iterator of values to be searched
1014 // \param keys a system input iterator of sorted keys
1015 // \param indices a system output iterator
1016 //
1017 template <typename KeyIterator, typename ValueIterator, typename OutputIterator>
1018 void upper_bound(
1019  const device_tag tag,
1020  const uint32 n,
1021  ValueIterator values,
1022  const uint32 n_keys,
1023  KeyIterator keys,
1024  OutputIterator indices)
1025 {
1026  #if 1
1028  keys, keys + n_keys,
1029  values, values + n,
1030  indices );
1031  #elif defined(__CUDACC__)
1032  const uint32 blockdim = 128;
1033  const uint32 n_blocks = divide_ri( n, blockdim );
1034 
1035  upper_bound_kernel<<<n_blocks,blockdim>>>( n, values, n_keys, keys, indices );
1036  #endif
1037 }
1038 
1039 // host-wide upper_bound
1040 //
1041 // \param n number of input items
1042 // \param values a system input iterator of values to be searched
1043 // \param keys a system input iterator of sorted keys
1044 // \param indices a system output iterator
1045 //
1046 template <typename KeyIterator, typename ValueIterator, typename OutputIterator>
1047 void upper_bound(
1048  const host_tag tag,
1049  const uint32 n,
1050  ValueIterator values,
1051  const uint32 n_keys,
1052  KeyIterator keys,
1053  OutputIterator indices)
1054 {
1055  #pragma omp parallel for
1056  for (long i = 0; i < long(n); ++i)
1057  indices[i] = uint32( upper_bound( values[i], keys, n_keys ) - keys );
1058 }
1059 
1060 // system-wide upper_bound
1061 //
1062 // \param n number of input items
1063 // \param values a system input iterator of values to be searched
1064 // \param keys a system input iterator of sorted keys
1065 // \param indices a system output iterator
1066 //
1067 template <typename system_tag, typename KeyIterator, typename ValueIterator, typename OutputIterator>
1069  const uint32 n,
1070  ValueIterator values,
1071  const uint32 n_keys,
1072  KeyIterator keys,
1073  OutputIterator indices)
1074 {
1075  upper_bound(
1076  system_tag(),
1077  n,
1078  values,
1079  n_keys,
1080  keys,
1081  indices );
1082 }
1083 
1084 #if defined(__CUDACC__)
1085 
1086 // device-wide sort
1087 //
1088 // \param n number of input items
1089 // \param keys a system input iterator of keys to be sorted
1090 //
1091 template <typename KeyIterator>
1092 void radix_sort(
1093  const device_tag tag,
1094  const uint32 n,
1095  KeyIterator keys,
1096  cugar::vector<device_tag,uint8>& temp_storage)
1097 {
1098  typedef typename std::iterator_traits<KeyIterator>::value_type key_type;
1099 
1100  cuda::alloc_temp_storage( temp_storage, 2 * n * sizeof(key_type) );
1101 
1102  key_type* keys_ptr = reinterpret_cast<key_type*>( raw_pointer( temp_storage ) );
1103 
1104  thrust::device_ptr<key_type> keys_buf( keys_ptr );
1105 
1106  thrust::copy( keys, keys + n, keys_buf );
1107 
1108  cuda::SortBuffers<key_type*> sort_buffers;
1109  sort_buffers.keys[0] = keys_ptr;
1110  sort_buffers.keys[1] = keys_ptr + n;
1111 
1112  cuda::SortEnactor sort_enactor;
1113  sort_enactor.sort( n, sort_buffers );
1114 
1115  thrust::copy(
1116  keys_buf + sort_buffers.selector * n,
1117  keys_buf + sort_buffers.selector * n + n,
1118  keys );
1119 }
1120 
1121 // device-wide sort by key
1122 //
1123 // \param n number of input items
1124 // \param keys a system input iterator of keys to be sorted
1125 // \param values a system input iterator of values to be sorted
1126 //
1127 template <typename KeyIterator, typename ValueIterator>
1128 void radix_sort(
1129  const device_tag tag,
1130  const uint32 n,
1131  KeyIterator keys,
1132  ValueIterator values,
1133  cugar::vector<device_tag,uint8>& temp_storage)
1134 {
1135  typedef typename std::iterator_traits<KeyIterator>::value_type key_type;
1136  typedef typename std::iterator_traits<ValueIterator>::value_type value_type;
1137 
1138  const uint32 aligned_key_bytes = align<16>( 2 * n * sizeof(key_type) );
1139  const uint32 aligned_val_bytes = 2 * n * sizeof(value_type);
1140  cuda::alloc_temp_storage( temp_storage, aligned_key_bytes + aligned_val_bytes );
1141 
1142  key_type* keys_ptr = reinterpret_cast<key_type*>( raw_pointer( temp_storage ) );
1143  value_type* values_ptr = reinterpret_cast<value_type*>( raw_pointer( temp_storage ) + aligned_key_bytes );
1144 
1145  thrust::device_ptr<key_type> keys_buf( keys_ptr );
1146  thrust::device_ptr<key_type> values_buf( values_ptr );
1147 
1148  thrust::copy( keys, keys + n, keys_buf );
1149  thrust::copy( values, values + n, values_buf );
1150 
1152  sort_buffers.keys[0] = keys_ptr;
1153  sort_buffers.keys[1] = keys_ptr + n;
1154  sort_buffers.values[0] = values_ptr;
1155  sort_buffers.values[1] = values_ptr + n;
1156 
1157  cuda::SortEnactor sort_enactor;
1158  sort_enactor.sort( n, sort_buffers );
1159 
1160  thrust::copy(
1161  keys_buf + sort_buffers.selector * n,
1162  keys_buf + sort_buffers.selector * n + n,
1163  keys );
1164 
1165  thrust::copy(
1166  values_buf + sort_buffers.selector * n,
1167  values_buf + sort_buffers.selector * n + n,
1168  values );
1169 }
1170 
1171 #endif
1172 
1173 // host-wide sort
1174 //
1175 // \param n number of input items
1176 // \param keys a system input iterator of keys to be sorted
1177 //
1178 template <typename KeyIterator>
1179 void radix_sort(
1180  const host_tag tag,
1181  const uint32 n,
1182  KeyIterator keys,
1183  cugar::vector<host_tag,uint8>& temp_storage)
1184 {
1185  thrust::sort( keys, keys + n );
1186 }
1187 
1188 // system-wide sort
1189 //
1190 // \param n number of input items
1191 // \param keys a system input iterator of keys to be sorted
1192 //
1193 template <typename system_tag, typename KeyIterator>
1195  const uint32 n,
1196  KeyIterator keys,
1197  cugar::vector<system_tag,uint8>& temp_storage)
1198 {
1199  radix_sort( system_tag(), n, keys, temp_storage );
1200 }
1201 
1202 // host-wide sort by key
1203 //
1204 // \param n number of input items
1205 // \param keys a system input iterator of keys to be sorted
1206 // \param values a system input iterator of values to be sorted
1207 //
1208 template <typename KeyIterator, typename ValueIterator>
1209 void radix_sort(
1210  const host_tag tag,
1211  const uint32 n,
1212  KeyIterator keys,
1213  ValueIterator values,
1214  cugar::vector<host_tag,uint8>& temp_storage)
1215 {
1216  thrust::sort_by_key( keys, keys + n, values, temp_storage );
1217 }
1218 
1219 // system-wide sort by key
1220 //
1221 // \param n number of input items
1222 // \param keys a system input iterator of keys to be sorted
1223 // \param values a system input iterator of values to be sorted
1224 //
1225 template <typename system_tag, typename KeyIterator, typename ValueIterator>
1227  const uint32 n,
1228  KeyIterator keys,
1229  ValueIterator values,
1230  cugar::vector<system_tag,uint8>& temp_storage)
1231 {
1232  radix_sort( system_tag(), n, keys, values, temp_storage );
1233 }
1234 
1235 template <
1236  typename key_iterator1,
1237  typename key_iterator2>
1238 CUGAR_HOST_DEVICE
1239 uint2 corank(
1240  const int32 i,
1241  const key_iterator1 A,
1242  const int32 m,
1243  const key_iterator2 B,
1244  const int32 n)
1245 {
1246  int32 j = min( i, m );
1247  int32 k = i - j;
1248 
1249  int32 j_lo = i >= n ? i - n : 0;
1250  int32 k_lo = 0;
1251 
1252  while (1)
1253  {
1254  if ((j > 0 || k < n) && A[j-1] > B[k])
1255  {
1256  // decrease j
1257  const int32 delta = divide_ri( j - j_lo, 2 );
1258  k_lo = k;
1259  j -= delta;
1260  k += delta;
1261  assert( j + k == i );
1262  }
1263  else if ((k > 0 || j < m) && B[k-1] >= A[j])
1264  {
1265  // decrease k
1266  const int32 delta = divide_ri( k - k_lo, 2 );
1267  j_lo = j;
1268  j += delta;
1269  k -= delta;
1270  assert( j + k == i );
1271  }
1272  else
1273  break;
1274  }
1275  return make_uint2( uint32(j), uint32(k) );
1276 }
1277 
1278 template <
1279  typename key_iterator1,
1280  typename key_iterator2,
1281  typename value_iterator1,
1282  typename value_iterator2,
1283  typename key_output,
1284  typename value_output>
1285 void merge_by_key(
1286  const host_tag tag,
1287  const uint32 A_len,
1288  const uint32 B_len,
1289  const key_iterator1 A_keys,
1290  const key_iterator2 B_keys,
1291  const value_iterator1 A_values,
1292  const value_iterator2 B_values,
1293  key_output C_keys,
1294  value_output C_values)
1295 {
1296  if (A_len == 0)
1297  {
1298  #pragma omp parallel for
1299  for (int32 i = 0; i < int32( B_len ); ++i)
1300  {
1301  C_keys[i] = A_keys[i];
1302  C_values[i] = A_values[i];
1303  }
1304  }
1305  else if (B_len == 0)
1306  {
1307  #pragma omp parallel for
1308  for (int32 i = 0; i < int32( A_len ); ++i)
1309  {
1310  C_keys[i] = A_keys[i];
1311  C_values[i] = A_values[i];
1312  }
1313  }
1314 
1315  const uint32 n_threads = (uint32)omp_get_num_procs();
1316 
1317  cugar::vector<host_tag,uint32> A_diag( n_threads+1 );
1318  cugar::vector<host_tag,uint32> B_diag( n_threads+1 );
1319 
1320  const uint32 C_len = A_len + B_len;
1321 
1322  A_diag[ n_threads ] = 0;
1323  B_diag[ n_threads ] = 0;
1324  A_diag[ n_threads ] = A_len;
1325  B_diag[ n_threads ] = B_len;
1326 
1327  const uint32 n_partition = divide_ri( C_len, n_threads );
1328 
1329  #pragma omp parallel for num_threads(n_threads)
1330  for (int32 i = 1; i < int32( n_threads ); ++i)
1331  {
1332  const int32 index = i * n_partition;
1333 
1334  const uint2 jk = corank( index, A_keys, A_len, B_keys, B_len );
1335 
1336  A_diag[i] = jk.x;
1337  B_diag[i] = jk.y;
1338  }
1339 
1340  #pragma omp parallel for num_threads(n_threads)
1341  for (int32 i = 0; i < int32( n_threads ); ++i)
1342  {
1344  A_keys + A_diag[i],
1345  A_keys + A_diag[i+1],
1346  B_keys + B_diag[i],
1347  B_keys + B_diag[i+1],
1348  A_values + A_diag[i],
1349  B_values + B_diag[i],
1350  C_keys + i * n_partition,
1351  C_values + i * n_partition );
1352  }
1353 /* for (uint32 i = 1; i < C_len; ++i)
1354  {
1355  if (C_keys[i-1] > C_keys[i])
1356  {
1357  fprintf(stderr, "merging error at %u: %llu, %llu\n", i, C_keys[i-1], C_keys[i] );
1358  exit(1);
1359  }
1360  }*/
1361 }
1362 
1363 template <
1364  typename key_iterator1,
1365  typename key_iterator2,
1366  typename value_iterator1,
1367  typename value_iterator2,
1368  typename key_output,
1369  typename value_output>
1370 void merge_by_key(
1371  const device_tag tag,
1372  const uint32 A_len,
1373  const uint32 B_len,
1374  const key_iterator1 A_keys,
1375  const key_iterator2 B_keys,
1376  const value_iterator1 A_values,
1377  const value_iterator2 B_values,
1378  key_output C_keys,
1379  value_output C_values)
1380 {
1382  A_keys,
1383  A_keys + A_len,
1384  B_keys,
1385  B_keys + A_len,
1386  A_values,
1387  B_values,
1388  C_keys,
1389  C_values );
1390 }
1391 
1392 template <
1393  typename system_tag,
1394  typename key_iterator1,
1395  typename key_iterator2,
1396  typename value_iterator1,
1397  typename value_iterator2,
1398  typename key_output,
1399  typename value_output>
1401  const uint32 A_len,
1402  const uint32 B_len,
1403  const key_iterator1 A_keys,
1404  const key_iterator2 B_keys,
1405  const value_iterator1 A_values,
1406  const value_iterator2 B_values,
1407  key_output C_keys,
1408  value_output C_values,
1409  cugar::vector<system_tag,uint8>& temp_storage)
1410 {
1411  merge_by_key(
1412  system_tag(),
1413  A_len,
1414  B_len,
1415  A_keys,
1416  B_keys,
1417  A_values,
1418  B_values,
1419  C_keys,
1420  C_values );
1421 }
1422 
1423 #if defined(__CUDACC__)
1424 
1427 template <typename iterator_type, typename functor_type>
1428 __global__
1429 void for_each_kernel(const uint64 n, const iterator_type in, const functor_type f)
1430 {
1431  const uint32 grid_size = blockDim.x * gridDim.x;
1432 
1433  for (uint64 i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += grid_size)
1434  f( in[i] );
1435 };
1436 
1437 #endif
1438 
1439 // ask the optimizer how many blocks we should try using next
1440 //
1441 template <typename KernelFunction>
1442 uint32 for_each_enactor<device_tag>::suggested_blocks(KernelFunction kernel, const uint32 cta_size) const
1443 {
1444 #if defined(__CUDACC__)
1445  if (m_blocks_hi == 0)
1446  return cuda::multiprocessor_count() * cuda::max_active_blocks_per_multiprocessor( kernel, cta_size, 0u );
1447  else if (m_blocks_lo == 0)
1448  return cuda::multiprocessor_count();
1449  else
1450  return cuda::multiprocessor_count() * (m_blocks_lo + m_blocks_hi) / 2;
1451 #else
1452  return 0u;
1453 #endif
1454 }
1455 
1456 // update the optimizer's internal state with the latest speed data-point
1457 //
1458 inline
1459 void for_each_enactor<device_tag>::update(const uint32 n_blocks, const float speed)
1460 {
1461 #if defined(__CUDACC__)
1462  // carry out a little binary search over the best number of blocks/SM
1463  if (m_blocks_hi == 0)
1464  {
1465  m_blocks_hi = uint32(n_blocks / cuda::multiprocessor_count());
1466  m_speed_hi = speed;
1467  }
1468  else if (m_blocks_lo == 0)
1469  {
1470  m_blocks_lo = uint32(n_blocks / cuda::multiprocessor_count());
1471  m_speed_lo = speed;
1472  }
1473  else if (m_speed_lo > m_speed_hi)
1474  {
1475  m_blocks_hi = uint32(n_blocks / cuda::multiprocessor_count());
1476  m_speed_hi = speed;
1477  }
1478  else
1479  {
1480  m_blocks_lo = uint32(n_blocks / cuda::multiprocessor_count());
1481  m_speed_lo = speed;
1482  }
1483  // TODO: once the optimizer settles to a given value, it will never change:
1484  // we should explore using occasional "mutations" to adapt to possibly
1485  // changing conditions...
1486 #endif
1487 }
1488 
1489 // enact the for_each
1490 //
1491 template <typename Iterator, typename Functor>
1493  const uint64 n,
1494  const Iterator in,
1495  Functor functor)
1496 {
1497 #if defined(__CUDACC__)
1498  const uint32 blockdim = 128;
1499  const uint32 n_blocks = suggested_blocks( for_each_kernel<Iterator,Functor>, blockdim );
1500 
1501  cuda::Timer timer;
1502  timer.start();
1503 
1504  for_each_kernel<<<n_blocks,blockdim>>>( n, in, functor );
1505 
1506  timer.stop();
1507 
1508  update( n_blocks, float(n) / timer.seconds() );
1509 #endif
1510 }
1511 
1512 } // namespace cugar
CUGAR_FORCEINLINE CUGAR_HOST_DEVICE index_type upper_bound_index(const Value x, Iterator begin, const index_type n)
Definition: algorithms.h:193
Definition: timer.h:46
void radix_sort(const uint32 n, KeyIterator keys, cugar::vector< system_tag, uint8 > &temp_storage)
Definition: primitives_inl.h:1194
void transform(const uint32 n, const Iterator in, const Output out, const Functor functor)
Definition: primitives_inl.h:357
Definition: primitives_inl.h:111
std::iterator_traits< InputIterator >::value_type reduce(const uint32 n, InputIterator d_in, BinaryOp op, thrust::device_vector< uint8 > &d_temp_storage)
Definition: primitives_inl.h:185
CUGAR_FORCEINLINE CUGAR_HOST_DEVICE void merge_by_key(key_iterator1 first1, key_iterator1 end1, key_iterator2 first2, key_iterator2 end2, value_iterator1 values1, value_iterator2 values2, key_iterator output_keys, value_iterator output_values)
Definition: algorithms.h:273
Definition: sort.h:103
bool any(const uint32 n, const PredicateIterator pred)
Definition: primitives_inl.h:91
uint32 copy_if(const uint32 n, InputIterator d_in, OutputIterator d_out, const Predicate pred, thrust::device_vector< uint8 > &d_temp_storage)
Definition: primitives_inl.h:349
uint32 runlength_encode(const uint32 n, InputIterator d_in, OutputIterator d_out, CountIterator d_counts, thrust::device_vector< uint8 > &d_temp_storage)
Definition: primitives_inl.h:392
Definition: types.h:181
void alloc_temp_storage(VectorType &vec, const uint64 size)
Definition: primitives_inl.h:39
T * raw_pointer(thrust::device_vector< T, Alloc > &vec)
Definition: thrust_view.h:69
bool is_sorted(const uint32 n, const Iterator values)
Definition: primitives_inl.h:234
Definition: primitives_inl.h:172
bool all(const uint32 n, const PredicateIterator pred)
Definition: primitives_inl.h:107
uint32 runlength_encode(const uint32 n, InputIterator in, OutputIterator out, CountIterator counts, cugar::vector< system_tag, uint8 > &temp_storage)
Definition: primitives_inl.h:859
CUGAR_HOST_DEVICE L divide_ri(const L x, const R y)
Definition: numbers.h:180
void start()
Definition: timer.h:103
void inclusive_scan(const uint32 n, InputIterator d_in, OutputIterator d_out, BinaryOp op, thrust::device_vector< uint8 > &d_temp_storage)
Definition: primitives_inl.h:228
uint32 reduce_by_key(const uint32 n, KeyIterator d_keys_in, ValueIterator d_values_in, OutputKeyIterator d_keys_out, OutputValueIterator d_values_out, ReductionOp reduction_op, thrust::device_vector< uint8 > &d_temp_storage)
Definition: primitives_inl.h:438
bool all(const uint32 n, const PredicateIterator pred)
Definition: primitives_inl.h:101
CUGAR_FORCEINLINE CUGAR_HOST_DEVICE Iterator upper_bound(const Value x, Iterator begin, const index_type n)
Definition: algorithms.h:138
bool any(const uint32 n, const PredicateIterator pred)
Definition: primitives_inl.h:91
Definition: types.h:185
bool is_segment_sorted(const uint32 n, const Iterator values, const Headflags flags)
Definition: primitives_inl.h:245
uint32 copy_flagged(const uint32 n, InputIterator d_in, FlagsIterator d_flags, OutputIterator d_out, thrust::device_vector< uint8 > &d_temp_storage)
Definition: primitives_inl.h:306
Definition: vector.h:117
void exclusive_scan(const uint32 n, InputIterator d_in, OutputIterator d_out, BinaryOp op, Identity identity, thrust::device_vector< uint8 > &d_temp_storage)
Definition: primitives_inl.h:265
void exclusive_scan(const uint32 n, InputIterator in, OutputIterator out, BinaryOp op, Identity identity, cugar::vector< system_tag, uint8 > &temp_storage)
Definition: primitives_inl.h:569
uint32 reduce_by_key(const uint32 n, KeyIterator keys_in, ValueIterator values_in, OutputKeyIterator keys_out, OutputValueIterator values_out, ReductionOp reduction_op, cugar::vector< system_tag, uint8 > &temp_storage)
Definition: primitives_inl.h:882
Define a vector_view POD type and plain_view() for std::vector.
Definition: diff.h:38
Definition: primitives.h:371
CUGAR_FORCEINLINE CUGAR_HOST_DEVICE index_type lower_bound_index(const Value x, Iterator begin, const index_type n)
Definition: algorithms.h:179
CUGAR_FORCEINLINE CUGAR_HOST_DEVICE Iterator lower_bound(const Value x, Iterator begin, const index_type n)
Definition: algorithms.h:99
uint32 copy_if(const uint32 n, InputIterator in, OutputIterator out, const Predicate pred, cugar::vector< system_tag, uint8 > &temp_storage)
Definition: primitives_inl.h:838
float seconds() const
Definition: timer.h:118
std::iterator_traits< InputIterator >::value_type reduce(const uint32 n, InputIterator in, BinaryOp op, cugar::vector< system_tag, uint8 > &temp_storage)
Definition: primitives_inl.h:520
void inclusive_scan(const uint32 n, InputIterator in, OutputIterator out, BinaryOp op, cugar::vector< system_tag, uint8 > &temp_storage)
Definition: primitives_inl.h:543
void stop()
Definition: timer.h:110
Definition: sort.h:160
void for_each(const uint64 n, const Iterator in, Functor functor)
Definition: primitives_inl.h:284
uint32 copy_flagged(const uint32 n, InputIterator in, FlagsIterator flags, OutputIterator out, cugar::vector< system_tag, uint8 > &temp_storage)
Definition: primitives_inl.h:817
Definition: functors.h:214
Definition: functors.h:754
void operator()(const uint64 n, const Iterator in, Functor functor)
Definition: primitives.h:376