NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
primitives_inl.h
Go to the documentation of this file.
1 /*
2  * nvbio
3  * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #pragma once
29 
30 namespace nvbio {
31 
32 // return true if any item in the range [0,n) evaluates to true
33 //
34 template <typename PredicateIterator>
35 bool any(
36  const host_tag tag,
37  const uint32 n,
38  const PredicateIterator pred)
39 {
40  return thrust::reduce(
41  pred,
42  pred + n,
43  false,
44  thrust::logical_or<bool>() );
45 }
46 
47 // return true if all items in the range [0,n) evaluate to true
48 //
49 template <typename PredicateIterator>
50 bool all(
51  const host_tag tag,
52  const uint32 n,
53  const PredicateIterator pred)
54 {
55  return thrust::reduce(
56  pred,
57  pred + n,
58  true,
59  thrust::logical_and<bool>() );
60 }
61 
62 #if defined(__CUDACC__)
63 
64 // return true if any item in the range [0,n) evaluates to true
65 //
66 template <typename PredicateIterator>
67 bool any(
68  const device_tag tag,
69  const uint32 n,
70  const PredicateIterator pred)
71 {
72  return cuda::any( n, pred );
73 }
74 
75 // return true if any item in the range [0,n) evaluates to true
76 //
77 template <typename PredicateIterator>
78 bool all(
79  const device_tag tag,
80  const uint32 n,
81  const PredicateIterator pred)
82 {
83  return cuda::all( n, pred );
84 }
85 
86 #endif
87 
88 // return true if any item in the range [0,n) evaluates to true
89 //
90 template <typename system_tag, typename PredicateIterator>
91 bool any(
92  const uint32 n,
93  const PredicateIterator pred)
94 {
95  return any( system_tag(), n, pred );
96 }
97 
98 // return true if all items in the range [0,n) evaluate to true
99 //
100 template <typename system_tag, typename PredicateIterator>
101 bool all(
102  const uint32 n,
103  const PredicateIterator pred)
104 {
105  return all( system_tag(), n, pred );
106 }
107 
108 // a pseudo-iterator to evaluate the predicate (it1[i] <= it2[i]) for arbitrary iterator pairs
109 //
110 template <typename Iterator1, typename Iterator2>
112 {
113  typedef bool value_type;
116  typedef value_type* pointer;
117  typedef typename std::iterator_traits<Iterator1>::difference_type difference_type;
118  typedef typename std::iterator_traits<Iterator1>::iterator_category iterator_category;
119 
120  // constructor
122  is_sorted_iterator(const Iterator1 _it1, const Iterator2 _it2) : it1( _it1 ), it2( _it2 ) {}
123 
124  // dereference operator
126  bool operator[] (const uint64 i) const { return it1[i] <= it2[i]; }
127 
128  // dereference operator
130  bool operator* () const { return it1[0] <= it2[0]; }
131 
132  // dereference operator
134  is_sorted_iterator& operator++ () { ++it1; ++it2; return *this; }
135 
136  Iterator1 it1;
137  Iterator2 it2;
138 };
139 
140 // operator+
141 template <typename T1, typename T2>
144 {
145  return is_sorted_iterator<T1,T2>( it.it1 + i, it.it2 + i );
146 }
147 // operator-
148 template <typename T1, typename T2>
151 {
152  return it1.it1 - it2.it1;
153 }
154 // operator!=
155 template <typename T1, typename T2>
158 {
159  return it1.it1 != it2.it1;
160 }
161 // operator==
162 template <typename T1, typename T2>
165 {
166  return it1.it1 == it2.it1;
167 }
168 
169 // a pseudo-iterator to evaluate the predicate (hd[i] || (it1[i] <= it2[i])) for arbitrary iterator pairs
170 //
171 template <typename Iterator1, typename Iterator2, typename Headflags>
173 {
174  typedef bool value_type;
177  typedef value_type* pointer;
178  typedef typename std::iterator_traits<Iterator1>::difference_type difference_type;
179  typedef typename std::iterator_traits<Iterator1>::iterator_category iterator_category;
180 
181  // constructor
183  is_segment_sorted_iterator(const Iterator1 _it1, const Iterator2 _it2, const Headflags _hd) : it1( _it1 ), it2( _it2 ), hd(_hd) {}
184 
185  // dereference operator
187  bool operator[] (const uint64 i) const { return hd[i] || (it1[i] <= it2[i]); }
188 
189  // dereference operator
191  bool operator* () const { return hd[0] || (it1[0] <= it2[0]); }
192 
193  // dereference operator
195  is_segment_sorted_iterator& operator++ () { ++it1; ++it2; ++hd; return *this; }
196 
197  Iterator1 it1;
198  Iterator2 it2;
199  Headflags hd;
200 };
201 
202 // operator+
203 template <typename T1, typename T2, typename H>
206 {
207  return is_segment_sorted_iterator<T1,T2,H>( it.it1 + i, it.it2 + i, it.hd + i );
208 }
209 // operator-
210 template <typename T1, typename T2, typename H>
213 {
214  return it1.it1 - it2.it1;
215 }
216 // operator!=
217 template <typename T1, typename T2, typename H>
220 {
221  return it1.it1 != it2.it1;
222 }
223 // operator==
224 template <typename T1, typename T2, typename H>
227 {
228  return it1.it1 == it2.it1;
229 }
230 
231 // return true if the items in the range [0,n) are sorted
232 //
233 template <typename system_tag, typename Iterator>
235  const uint32 n,
236  const Iterator values)
237 {
238  return all<system_tag>( n-1, is_sorted_iterator<Iterator,Iterator>( values, values+1 ) );
239 }
240 
241 // return true if the items in the range [0,n) are sorted by segment, where
242 // the beginning of each segment is identified by a set head flag
243 //
244 template <typename system_tag, typename Iterator, typename Headflags>
246  const uint32 n,
247  const Iterator values,
248  const Headflags flags)
249 {
250  return all<system_tag>( n-1, is_segment_sorted_iterator<Iterator,Iterator,Headflags>( values, values+1, flags+1 ) );
251 }
252 
253 // invoke a functor for each element of the given sequence
254 //
255 template <typename Iterator, typename Functor>
256 void for_each(
257  const host_tag tag,
258  const uint64 n,
259  const Iterator in,
260  Functor functor)
261 {
262  #if defined(_OPENMP)
263  #pragma omp parallel for if (n >= 256)
264  #endif
265  for (int64 i = 0; i < int64(n); ++i)
266  functor( in[i] );
267 }
268 
269 // invoke a functor for each element of the given sequence
270 //
271 template <typename Iterator, typename Functor>
272 void for_each(
273  const device_tag tag,
274  const uint64 n,
275  const Iterator in,
276  Functor functor)
277 {
278  thrust::for_each( in, in + n, functor );
279 }
280 
281 // invoke a functor for each element of the given sequence
282 //
283 template <typename system_tag, typename Iterator, typename Functor>
284 void for_each(
285  const uint64 n,
286  const Iterator in,
287  Functor functor)
288 {
289  return for_each( system_tag(), n, in, functor );
290 }
291 
292 // apply a functor to each element of the given sequence
293 //
294 template <typename Iterator, typename Output, typename Functor>
296  const device_tag tag,
297  const uint64 n,
298  const Iterator in,
299  const Output out,
300  const Functor functor)
301 {
302  thrust::transform( in, in + n, out, functor );
303 }
304 
305 // apply a functor to each element of the given sequence
306 //
307 template <typename Iterator, typename Output, typename Functor>
309  const host_tag tag,
310  const uint32 n,
311  const Iterator in,
312  const Output out,
313  const Functor functor)
314 {
315  #if defined(_OPENMP)
316  #pragma omp parallel for if (n >= 256)
317  #endif
318  for (int64 i = 0; i < int64(n); ++i)
319  out[i] = functor( in[i] );
320 }
321 
322 // apply a binary functor to each pair of elements of the given sequences
323 //
324 template <typename Iterator1, typename Iterator2, typename Output, typename Functor>
326  const device_tag tag,
327  const uint32 n,
328  const Iterator1 in1,
329  const Iterator2 in2,
330  const Output out,
331  const Functor functor)
332 {
333  thrust::transform( in1, in1 + n, in2, out, functor );
334 }
335 
336 // apply a binary functor to each pair of elements of the given sequences
337 //
338 template <typename Iterator1, typename Iterator2, typename Output, typename Functor>
340  const host_tag tag,
341  const uint32 n,
342  const Iterator1 in1,
343  const Iterator2 in2,
344  const Output out,
345  const Functor functor)
346 {
347  #if defined(_OPENMP)
348  #pragma omp parallel for if (n >= 256)
349  #endif
350  for (int64 i = 0; i < int64(n); ++i)
351  out[i] = functor( in1[i], in2[i] );
352 }
353 
354 // apply a functor to each element of the given sequence
355 //
356 template <typename system_tag, typename Iterator, typename Output, typename Functor>
358  const uint32 n,
359  const Iterator in,
360  const Output out,
361  const Functor functor)
362 {
363  transform( system_tag(), n, in, out, functor );
364 }
365 
366 // apply a binary functor to each pair of elements of the given sequences
367 //
368 template <typename system_tag, typename Iterator1, typename Iterator2, typename Output, typename Functor>
370  const uint32 n,
371  const Iterator1 in1,
372  const Iterator2 in2,
373  const Output out,
374  const Functor functor)
375 {
376  transform( system_tag(), n, in1, in2, out, functor );
377 }
378 
379 // host-wide reduce
380 //
381 // \param n number of items to reduce
382 // \param in a system iterator
383 // \param op the binary reduction operator
384 // \param temp_storage some temporary storage
385 //
386 template <typename InputIterator, typename BinaryOp>
387 typename std::iterator_traits<InputIterator>::value_type reduce(
388  host_tag tag,
389  const uint32 n,
390  InputIterator in,
391  BinaryOp op,
392  nvbio::vector<host_tag,uint8>& temp_storage)
393 {
394  return thrust::reduce( in, in + n, 0u, op );
395 }
396 
397 // host-wide inclusive scan
398 //
399 // \param n number of items to reduce
400 // \param in a device input iterator
401 // \param out a device output iterator
402 // \param op the binary reduction operator
403 // \param temp_storage some temporary storage
404 //
405 template <typename InputIterator, typename OutputIterator, typename BinaryOp>
407  host_tag tag,
408  const uint32 n,
409  InputIterator in,
410  OutputIterator out,
411  BinaryOp op,
412  nvbio::vector<host_tag,uint8>& temp_storage)
413 {
415  in,
416  in + n,
417  out,
418  op );
419 }
420 
421 // host-wide exclusive scan
422 //
423 // \param n number of items to reduce
424 // \param in a device input iterator
425 // \param out a device output iterator
426 // \param op the binary reduction operator
427 // \param identity the identity element
428 // \param temp_storage some temporary storage
429 //
430 template <typename InputIterator, typename OutputIterator, typename BinaryOp, typename Identity>
432  host_tag tag,
433  const uint32 n,
434  InputIterator in,
435  OutputIterator out,
436  BinaryOp op,
437  Identity identity,
438  nvbio::vector<host_tag,uint8>& temp_storage)
439 {
441  in,
442  in + n,
443  out,
444  identity,
445  op );
446 }
447 
448 #if defined(__CUDACC__)
449 
450 // system-wide reduce
451 //
452 // \param n number of items to reduce
453 // \param in a system iterator
454 // \param op the binary reduction operator
455 // \param temp_storage some temporary storage
456 //
457 template <typename InputIterator, typename BinaryOp>
458 typename std::iterator_traits<InputIterator>::value_type reduce(
459  device_tag tag,
460  const uint32 n,
461  InputIterator in,
462  BinaryOp op,
463  nvbio::vector<device_tag,uint8>& temp_storage)
464 {
465  return cuda::reduce( n, in, op, temp_storage );
466 }
467 
468 // device-wide inclusive scan
469 //
470 // \param n number of items to reduce
471 // \param in a device input iterator
472 // \param out a device output iterator
473 // \param op the binary reduction operator
474 // \param temp_storage some temporary storage
475 //
476 template <typename InputIterator, typename OutputIterator, typename BinaryOp>
477 void inclusive_scan(
478  device_tag tag,
479  const uint32 n,
480  InputIterator in,
481  OutputIterator out,
482  BinaryOp op,
483  nvbio::vector<device_tag,uint8>& temp_storage)
484 {
485  cuda::inclusive_scan( n, in, out, op, temp_storage );
486 }
487 
488 // device-wide exclusive scan
489 //
490 // \param n number of items to reduce
491 // \param in a device input iterator
492 // \param out a device output iterator
493 // \param op the binary reduction operator
494 // \param identity the identity element
495 // \param temp_storage some temporary storage
496 //
497 template <typename InputIterator, typename OutputIterator, typename BinaryOp, typename Identity>
498 void exclusive_scan(
499  device_tag tag,
500  const uint32 n,
501  InputIterator in,
502  OutputIterator out,
503  BinaryOp op,
504  Identity identity,
505  nvbio::vector<device_tag,uint8>& temp_storage)
506 {
507  cuda::exclusive_scan( n, in, out, op, identity, temp_storage );
508 }
509 
510 #endif
511 
512 // system-wide reduce
513 //
514 // \param n number of items to reduce
515 // \param in a system iterator
516 // \param op the binary reduction operator
517 // \param temp_storage some temporary storage
518 //
519 template <typename system_tag, typename InputIterator, typename BinaryOp>
520 typename std::iterator_traits<InputIterator>::value_type reduce(
521  const uint32 n,
522  InputIterator in,
523  BinaryOp op,
524  nvbio::vector<system_tag,uint8>& temp_storage)
525 {
526  return reduce(
527  system_tag(),
528  n,
529  in,
530  op,
531  temp_storage );
532 }
533 
534 // device-wide inclusive scan
535 //
536 // \param n number of items to reduce
537 // \param in a device input iterator
538 // \param out a device output iterator
539 // \param op the binary reduction operator
540 // \param temp_storage some temporary storage
541 //
542 template <typename system_tag, typename InputIterator, typename OutputIterator, typename BinaryOp>
544  const uint32 n,
545  InputIterator in,
546  OutputIterator out,
547  BinaryOp op,
548  nvbio::vector<system_tag,uint8>& temp_storage)
549 {
551  system_tag(),
552  n,
553  in,
554  out,
555  op,
556  temp_storage );
557 }
558 
559 // device-wide exclusive scan
560 //
561 // \param n number of items to reduce
562 // \param in a device input iterator
563 // \param out a device output iterator
564 // \param op the binary reduction operator
565 // \param identity the identity element
566 // \param temp_storage some temporary storage
567 //
568 template <typename system_tag, typename InputIterator, typename OutputIterator, typename BinaryOp, typename Identity>
570  const uint32 n,
571  InputIterator in,
572  OutputIterator out,
573  BinaryOp op,
574  Identity identity,
575  nvbio::vector<system_tag,uint8>& temp_storage)
576 {
578  system_tag(),
579  n,
580  in,
581  out,
582  op,
583  identity,
584  temp_storage );
585 }
586 
587 // host-wide copy of flagged items
588 //
589 // \param n number of input items
590 // \param in a input iterator
591 // \param flags a flags iterator
592 // \param out a output iterator
593 // \param temp_storage some temporary storage
594 //
595 // \return the number of copied items
596 //
597 template <typename InputIterator, typename FlagsIterator, typename OutputIterator>
599  const host_tag tag,
600  const uint32 n,
601  InputIterator in,
602  FlagsIterator flags,
603  OutputIterator out,
604  nvbio::vector<host_tag,uint8>& temp_storage)
605 {
606  return uint32( thrust::copy_if(
607  in,
608  in + n,
609  flags,
610  out,
611  nvbio::is_true_functor<bool>() ) - out );
612 }
613 
614 // host-wide copy of predicated items
615 //
616 // \param n number of input items
617 // \param in a input iterator
618 // \param flags a flags iterator
619 // \param out a output iterator
620 // \param temp_storage some temporary storage
621 //
622 // \return the number of copied items
623 //
624 template <typename InputIterator, typename OutputIterator, typename Predicate>
626  const host_tag tag,
627  const uint32 n,
628  InputIterator in,
629  OutputIterator out,
630  const Predicate pred,
631  nvbio::vector<host_tag,uint8>& temp_storage)
632 {
633  return uint32( thrust::copy_if(
634  in,
635  in + n,
636  out,
637  pred ) - out );
638 }
639 
640 // system-wide run-length encode
641 //
642 // \param n number of input items
643 // \param in a system input iterator
644 // \param out a system output iterator
645 // \param counts a system output count iterator
646 // \param temp_storage some temporary storage
647 //
648 // \return the number of copied items
649 //
650 template <typename InputIterator, typename OutputIterator, typename CountIterator>
652  const host_tag tag,
653  const uint32 n,
654  InputIterator in,
655  OutputIterator out,
656  CountIterator counts,
657  nvbio::vector<host_tag,uint8>& temp_storage)
658 {
660  in,
661  in + n,
662  thrust::make_constant_iterator<uint32>( 1u ),
663  out,
664  counts ).first - out );
665 };
666 
667 
668 // system-wide run-length encode
669 //
670 // \param n number of input items
671 // \param keys_in a system input iterator
672 // \param values_in a system input iterator
673 // \param keys_out a system output iterator
674 // \param values_out a system output iterator
675 // \param reduction_op a reduction operator
676 // \param temp_storage some temporary storage
677 //
678 // \return the number of copied items
679 //
680 template <typename KeyIterator, typename ValueIterator, typename OutputKeyIterator, typename OutputValueIterator, typename ReductionOp>
682  const host_tag tag,
683  const uint32 n,
684  KeyIterator keys_in,
685  ValueIterator values_in,
686  OutputKeyIterator keys_out,
687  OutputValueIterator values_out,
688  ReductionOp reduction_op,
689  nvbio::vector<host_tag,uint8>& temp_storage)
690 {
691  typedef typename std::iterator_traits<KeyIterator>::value_type key_type;
692 
694  keys_in,
695  keys_in + n,
696  values_in,
697  keys_out,
698  values_out,
700  reduction_op ).first - keys_out );
701 }
702 
703 #if defined(__CUDACC__)
704 
705 // device-wide copy of flagged items
706 //
707 // \param n number of input items
708 // \param in a input iterator
709 // \param flags a flags iterator
710 // \param out a output iterator
711 // \param temp_storage some temporary storage
712 //
713 // \return the number of copied items
714 //
715 template <typename InputIterator, typename FlagsIterator, typename OutputIterator>
717  const device_tag tag,
718  const uint32 n,
719  InputIterator in,
720  FlagsIterator flags,
721  OutputIterator out,
722  nvbio::vector<device_tag,uint8>& temp_storage)
723 {
724  return cuda::copy_flagged( n, in, flags, out, temp_storage );
725 }
726 
727 // device-wide copy of predicated items
728 //
729 // \param n number of input items
730 // \param in a input iterator
731 // \param flags a flags iterator
732 // \param out a output iterator
733 // \param temp_storage some temporary storage
734 //
735 // \return the number of copied items
736 //
737 template <typename InputIterator, typename OutputIterator, typename Predicate>
739  const device_tag tag,
740  const uint32 n,
741  InputIterator in,
742  OutputIterator out,
743  const Predicate pred,
744  nvbio::vector<device_tag,uint8>& temp_storage)
745 {
746  return cuda::copy_if( n, in, out, pred, temp_storage );
747 }
748 
749 // system-wide run-length encode
750 //
751 // \param n number of input items
752 // \param in a device input iterator
753 // \param out a device output iterator
754 // \param counts a device output count iterator
755 // \param temp_storage some temporary storage
756 //
757 // \return the number of copied items
758 //
759 template <typename InputIterator, typename OutputIterator, typename CountIterator>
761  const device_tag tag,
762  const uint32 n,
763  InputIterator in,
764  OutputIterator out,
765  CountIterator counts,
766  nvbio::vector<device_tag,uint8>& temp_storage)
767 {
768  return cuda::runlength_encode( n, in, out, counts, temp_storage );
769 };
770 
771 // device-wide run-length encode
772 //
773 // \param n number of input items
774 // \param keys_in a device input iterator
775 // \param values_in a device input iterator
776 // \param keys_out a device output iterator
777 // \param values_out a device output iterator
778 // \param reduction_op a reduction operator
779 // \param temp_storage some temporary storage
780 //
781 // \return the number of copied items
782 //
783 template <typename KeyIterator, typename ValueIterator, typename OutputKeyIterator, typename OutputValueIterator, typename ReductionOp>
785  const device_tag tag,
786  const uint32 n,
787  KeyIterator keys_in,
788  ValueIterator values_in,
789  OutputKeyIterator keys_out,
790  OutputValueIterator values_out,
791  ReductionOp reduction_op,
792  nvbio::vector<device_tag,uint8>& temp_storage)
793 {
794  return cuda::reduce_by_key(
795  n,
796  keys_in,
797  values_in,
798  keys_out,
799  values_out,
800  reduction_op,
801  temp_storage );
802 }
803 
804 #endif
805 
806 // system-wide copy of flagged items
807 //
808 // \param n number of input items
809 // \param in a device input iterator
810 // \param flags a device flags iterator
811 // \param out a device output iterator
812 // \param temp_storage some temporary storage
813 //
814 // \return the number of copied items
815 //
816 template <typename system_tag, typename InputIterator, typename FlagsIterator, typename OutputIterator>
818  const uint32 n,
819  InputIterator in,
820  FlagsIterator flags,
821  OutputIterator out,
822  nvbio::vector<system_tag,uint8>& temp_storage)
823 {
824  return copy_flagged( system_tag(), n, in, flags, out, temp_storage );
825 };
826 
827 // system-wide copy of predicated items
828 //
829 // \param n number of input items
830 // \param in a device input iterator
831 // \param out a device output iterator
832 // \param pred a unary predicate functor
833 // \param temp_storage some temporary storage
834 //
835 // \return the number of copied items
836 //
837 template <typename system_tag, typename InputIterator, typename OutputIterator, typename Predicate>
839  const uint32 n,
840  InputIterator in,
841  OutputIterator out,
842  const Predicate pred,
843  nvbio::vector<system_tag,uint8>& temp_storage)
844 {
845  return copy_if( system_tag(), n, in, out, pred, temp_storage );
846 };
847 
848 // system-wide run-length encode
849 //
850 // \param n number of input items
851 // \param in a system input iterator
852 // \param out a system output iterator
853 // \param counts a system output count iterator
854 // \param temp_storage some temporary storage
855 //
856 // \return the number of copied items
857 //
858 template <typename system_tag, typename InputIterator, typename OutputIterator, typename CountIterator>
860  const uint32 n,
861  InputIterator in,
862  OutputIterator out,
863  CountIterator counts,
864  nvbio::vector<system_tag,uint8>& temp_storage)
865 {
866  return runlength_encode( system_tag(), n, in, out, counts, temp_storage );
867 };
868 
869 // system-wide run-length encode
870 //
871 // \param n number of input items
872 // \param keys_in a system input iterator
873 // \param values_in a system input iterator
874 // \param keys_out a system output iterator
875 // \param values_out a system output iterator
876 // \param reduction_op a reduction operator
877 // \param temp_storage some temporary storage
878 //
879 // \return the number of copied items
880 //
881 template <typename system_tag, typename KeyIterator, typename ValueIterator, typename OutputKeyIterator, typename OutputValueIterator, typename ReductionOp>
883  const uint32 n,
884  KeyIterator keys_in,
885  ValueIterator values_in,
886  OutputKeyIterator keys_out,
887  OutputValueIterator values_out,
888  ReductionOp reduction_op,
889  nvbio::vector<system_tag,uint8>& temp_storage)
890 {
891  return reduce_by_key(
892  system_tag(),
893  n,
894  keys_in,
895  values_in,
896  keys_out,
897  values_out,
898  reduction_op,
899  temp_storage );
900 }
901 
902 // device-wide lower_bound
903 //
904 // \param n number of input items
905 // \param values a system input iterator of values to be searched
906 // \param keys a system input iterator of sorted keys
907 // \param indices a system output iterator
908 //
909 template <typename KeyIterator, typename ValueIterator, typename OutputIterator>
911  const device_tag tag,
912  const uint32 n,
913  ValueIterator values,
914  const uint32 n_keys,
915  KeyIterator keys,
916  OutputIterator indices)
917 {
919  keys, keys + n_keys,
920  values, values + n,
921  indices );
922 }
923 
924 // host-wide lower_bound
925 //
926 // \param n number of input items
927 // \param values a system input iterator of values to be searched
928 // \param keys a system input iterator of sorted keys
929 // \param indices a system output iterator
930 //
931 template <typename KeyIterator, typename ValueIterator, typename OutputIterator>
933  const host_tag tag,
934  const uint32 n,
935  ValueIterator values,
936  const uint32 n_keys,
937  KeyIterator keys,
938  OutputIterator indices)
939 {
940  #pragma omp parallel for
941  for (long i = 0; i < long(n); ++i)
942  indices[i] = uint32( lower_bound( values[i], keys, n_keys ) - keys );
943 }
944 
945 // system-wide lower_bound
946 //
947 // \param n number of input items
948 // \param values a system input iterator of values to be searched
949 // \param keys a system input iterator of sorted keys
950 // \param indices a system output iterator
951 //
952 template <typename system_tag, typename KeyIterator, typename ValueIterator, typename OutputIterator>
954  const uint32 n,
955  ValueIterator values,
956  const uint32 n_keys,
957  KeyIterator keys,
958  OutputIterator indices)
959 {
960  lower_bound(
961  system_tag(),
962  n,
963  values,
964  n_keys,
965  keys,
966  indices );
967 }
968 
969 // device-wide upper_bound
970 //
971 // \param n number of input items
972 // \param values a system input iterator of values to be searched
973 // \param keys a system input iterator of sorted keys
974 // \param indices a system output iterator
975 //
976 template <typename KeyIterator, typename ValueIterator, typename OutputIterator>
978  const device_tag tag,
979  const uint32 n,
980  ValueIterator values,
981  const uint32 n_keys,
982  KeyIterator keys,
983  OutputIterator indices)
984 {
986  keys, keys + n_keys,
987  values, values + n,
988  indices );
989 }
990 
991 // host-wide upper_bound
992 //
993 // \param n number of input items
994 // \param values a system input iterator of values to be searched
995 // \param keys a system input iterator of sorted keys
996 // \param indices a system output iterator
997 //
998 template <typename KeyIterator, typename ValueIterator, typename OutputIterator>
1000  const host_tag tag,
1001  const uint32 n,
1002  ValueIterator values,
1003  const uint32 n_keys,
1004  KeyIterator keys,
1005  OutputIterator indices)
1006 {
1007  #pragma omp parallel for
1008  for (long i = 0; i < long(n); ++i)
1009  indices[i] = uint32( upper_bound( values[i], keys, n_keys ) - keys );
1010 }
1011 
1012 // system-wide upper_bound
1013 //
1014 // \param n number of input items
1015 // \param values a system input iterator of values to be searched
1016 // \param keys a system input iterator of sorted keys
1017 // \param indices a system output iterator
1018 //
1019 template <typename system_tag, typename KeyIterator, typename ValueIterator, typename OutputIterator>
1021  const uint32 n,
1022  ValueIterator values,
1023  const uint32 n_keys,
1024  KeyIterator keys,
1025  OutputIterator indices)
1026 {
1027  upper_bound(
1028  system_tag(),
1029  n,
1030  values,
1031  n_keys,
1032  keys,
1033  indices );
1034 }
1035 
1036 #if defined(__CUDACC__)
1037 
1038 // device-wide sort
1039 //
1040 // \param n number of input items
1041 // \param keys a system input iterator of keys to be sorted
1042 //
1043 template <typename KeyIterator>
1044 void radix_sort(
1045  const device_tag tag,
1046  const uint32 n,
1047  KeyIterator keys,
1048  nvbio::vector<device_tag,uint8>& temp_storage)
1049 {
1050  typedef typename std::iterator_traits<KeyIterator>::value_type key_type;
1051 
1052  cuda::alloc_temp_storage( temp_storage, 2 * n * sizeof(key_type) );
1053 
1054  key_type* keys_ptr = reinterpret_cast<key_type*>( raw_pointer( temp_storage ) );
1055 
1056  thrust::device_ptr<key_type> keys_buf( keys_ptr );
1057 
1058  thrust::copy( keys, keys + n, keys_buf );
1059 
1060  cuda::SortBuffers<key_type*> sort_buffers;
1061  sort_buffers.keys[0] = keys_ptr;
1062  sort_buffers.keys[1] = keys_ptr + n;
1063 
1064  cuda::SortEnactor sort_enactor;
1065  sort_enactor.sort( n, sort_buffers );
1066 
1067  thrust::copy(
1068  keys_buf + sort_buffers.selector * n,
1069  keys_buf + sort_buffers.selector * n + n,
1070  keys );
1071 }
1072 
1073 // device-wide sort by key
1074 //
1075 // \param n number of input items
1076 // \param keys a system input iterator of keys to be sorted
1077 // \param values a system input iterator of values to be sorted
1078 //
1079 template <typename KeyIterator, typename ValueIterator>
1080 void radix_sort(
1081  const device_tag tag,
1082  const uint32 n,
1083  KeyIterator keys,
1084  ValueIterator values,
1085  nvbio::vector<device_tag,uint8>& temp_storage)
1086 {
1087  typedef typename std::iterator_traits<KeyIterator>::value_type key_type;
1088  typedef typename std::iterator_traits<ValueIterator>::value_type value_type;
1089 
1090  const uint32 aligned_key_bytes = align<16>( 2 * n * sizeof(key_type) );
1091  const uint32 aligned_val_bytes = 2 * n * sizeof(value_type);
1092  cuda::alloc_temp_storage( temp_storage, aligned_key_bytes + aligned_val_bytes );
1093 
1094  key_type* keys_ptr = reinterpret_cast<key_type*>( raw_pointer( temp_storage ) );
1095  value_type* values_ptr = reinterpret_cast<value_type*>( raw_pointer( temp_storage ) + aligned_key_bytes );
1096 
1097  thrust::device_ptr<key_type> keys_buf( keys_ptr );
1098  thrust::device_ptr<key_type> values_buf( values_ptr );
1099 
1100  thrust::copy( keys, keys + n, keys_buf );
1101  thrust::copy( values, values + n, values_buf );
1102 
1103  cuda::SortBuffers<key_type*, value_type*> sort_buffers;
1104  sort_buffers.keys[0] = keys_ptr;
1105  sort_buffers.keys[1] = keys_ptr + n;
1106  sort_buffers.values[0] = values_ptr;
1107  sort_buffers.values[1] = values_ptr + n;
1108 
1109  cuda::SortEnactor sort_enactor;
1110  sort_enactor.sort( n, sort_buffers );
1111 
1112  thrust::copy(
1113  keys_buf + sort_buffers.selector * n,
1114  keys_buf + sort_buffers.selector * n + n,
1115  keys );
1116 
1117  thrust::copy(
1118  values_buf + sort_buffers.selector * n,
1119  values_buf + sort_buffers.selector * n + n,
1120  values );
1121 }
1122 
1123 #endif
1124 
1125 // host-wide sort
1126 //
1127 // \param n number of input items
1128 // \param keys a system input iterator of keys to be sorted
1129 //
1130 template <typename KeyIterator>
1132  const host_tag tag,
1133  const uint32 n,
1134  KeyIterator keys,
1135  nvbio::vector<host_tag,uint8>& temp_storage)
1136 {
1137  thrust::sort( keys, keys + n );
1138 }
1139 
1140 // system-wide sort
1141 //
1142 // \param n number of input items
1143 // \param keys a system input iterator of keys to be sorted
1144 //
1145 template <typename system_tag, typename KeyIterator>
1147  const uint32 n,
1148  KeyIterator keys,
1149  nvbio::vector<system_tag,uint8>& temp_storage)
1150 {
1151  radix_sort( system_tag(), n, keys, temp_storage );
1152 }
1153 
1154 // host-wide sort by key
1155 //
1156 // \param n number of input items
1157 // \param keys a system input iterator of keys to be sorted
1158 // \param values a system input iterator of values to be sorted
1159 //
1160 template <typename KeyIterator, typename ValueIterator>
1162  const host_tag tag,
1163  const uint32 n,
1164  KeyIterator keys,
1165  ValueIterator values,
1166  nvbio::vector<host_tag,uint8>& temp_storage)
1167 {
1168  thrust::sort_by_key( keys, keys + n, values, temp_storage );
1169 }
1170 
1171 // system-wide sort by key
1172 //
1173 // \param n number of input items
1174 // \param keys a system input iterator of keys to be sorted
1175 // \param values a system input iterator of values to be sorted
1176 //
1177 template <typename system_tag, typename KeyIterator, typename ValueIterator>
1179  const uint32 n,
1180  KeyIterator keys,
1181  ValueIterator values,
1182  nvbio::vector<system_tag,uint8>& temp_storage)
1183 {
1184  radix_sort( system_tag(), n, keys, values, temp_storage );
1185 }
1186 
1187 template <
1188  typename key_iterator1,
1189  typename key_iterator2>
1190 uint2 corank(
1191  const int32 i,
1192  const key_iterator1 A,
1193  const int32 m,
1194  const key_iterator2 B,
1195  const int32 n)
1196 {
1197  int32 j = min( i, m );
1198  int32 k = i - j;
1199 
1200  int32 j_lo = i >= n ? i - n : 0;
1201  int32 k_lo = 0;
1202 
1203  while (1)
1204  {
1205  if ((j > 0 || k < n) && A[j-1] > B[k])
1206  {
1207  // decrease j
1208  const int32 delta = util::divide_ri( j - j_lo, 2 );
1209  k_lo = k;
1210  j -= delta;
1211  k += delta;
1212  assert( j + k == i );
1213  }
1214  else if ((k > 0 || j < m) && B[k-1] >= A[j])
1215  {
1216  // decrease k
1217  const int32 delta = util::divide_ri( k - k_lo, 2 );
1218  j_lo = j;
1219  j += delta;
1220  k -= delta;
1221  assert( j + k == i );
1222  }
1223  else
1224  break;
1225  }
1226  return make_uint2( uint32(j), uint32(k) );
1227 }
1228 
1229 template <
1230  typename key_iterator1,
1231  typename key_iterator2,
1232  typename value_iterator1,
1233  typename value_iterator2,
1234  typename key_output,
1235  typename value_output>
1237  const host_tag tag,
1238  const uint32 A_len,
1239  const uint32 B_len,
1240  const key_iterator1 A_keys,
1241  const key_iterator2 B_keys,
1242  const value_iterator1 A_values,
1243  const value_iterator2 B_values,
1244  key_output C_keys,
1245  value_output C_values)
1246 {
1247  if (A_len == 0)
1248  {
1249  #pragma omp parallel for
1250  for (int32 i = 0; i < int32( B_len ); ++i)
1251  {
1252  C_keys[i] = A_keys[i];
1253  C_values[i] = A_values[i];
1254  }
1255  }
1256  else if (B_len == 0)
1257  {
1258  #pragma omp parallel for
1259  for (int32 i = 0; i < int32( A_len ); ++i)
1260  {
1261  C_keys[i] = A_keys[i];
1262  C_values[i] = A_values[i];
1263  }
1264  }
1265 
1266  const uint32 n_threads = (uint32)omp_get_num_procs();
1267 
1268  nvbio::vector<host_tag,uint32> A_diag( n_threads+1 );
1269  nvbio::vector<host_tag,uint32> B_diag( n_threads+1 );
1270 
1271  const uint32 C_len = A_len + B_len;
1272 
1273  A_diag[ n_threads ] = 0;
1274  B_diag[ n_threads ] = 0;
1275  A_diag[ n_threads ] = A_len;
1276  B_diag[ n_threads ] = B_len;
1277 
1278  const uint32 n_partition = util::divide_ri( C_len, n_threads );
1279 
1280  #pragma omp parallel for num_threads(n_threads)
1281  for (int32 i = 1; i < int32( n_threads ); ++i)
1282  {
1283  const int32 index = i * n_partition;
1284 
1285  const uint2 jk = corank( index, A_keys, A_len, B_keys, B_len );
1286 
1287  A_diag[i] = jk.x;
1288  B_diag[i] = jk.y;
1289  }
1290 
1291  #pragma omp parallel for num_threads(n_threads)
1292  for (int32 i = 0; i < int32( n_threads ); ++i)
1293  {
1295  A_keys + A_diag[i],
1296  A_keys + A_diag[i+1],
1297  B_keys + B_diag[i],
1298  B_keys + B_diag[i+1],
1299  A_values + A_diag[i],
1300  B_values + B_diag[i],
1301  C_keys + i * n_partition,
1302  C_values + i * n_partition );
1303  }
1304 /* for (uint32 i = 1; i < C_len; ++i)
1305  {
1306  if (C_keys[i-1] > C_keys[i])
1307  {
1308  fprintf(stderr, "merging error at %u: %llu, %llu\n", i, C_keys[i-1], C_keys[i] );
1309  exit(1);
1310  }
1311  }*/
1312 }
1313 
1314 template <
1315  typename key_iterator1,
1316  typename key_iterator2,
1317  typename value_iterator1,
1318  typename value_iterator2,
1319  typename key_output,
1320  typename value_output>
1322  const device_tag tag,
1323  const uint32 A_len,
1324  const uint32 B_len,
1325  const key_iterator1 A_keys,
1326  const key_iterator2 B_keys,
1327  const value_iterator1 A_values,
1328  const value_iterator2 B_values,
1329  key_output C_keys,
1330  value_output C_values)
1331 {
1333  A_keys,
1334  A_keys + A_len,
1335  B_keys,
1336  B_keys + A_len,
1337  A_values,
1338  B_values,
1339  C_keys,
1340  C_values );
1341 }
1342 
1343 template <
1344  typename system_tag,
1345  typename key_iterator1,
1346  typename key_iterator2,
1347  typename value_iterator1,
1348  typename value_iterator2,
1349  typename key_output,
1350  typename value_output>
1352  const uint32 A_len,
1353  const uint32 B_len,
1354  const key_iterator1 A_keys,
1355  const key_iterator2 B_keys,
1356  const value_iterator1 A_values,
1357  const value_iterator2 B_values,
1358  key_output C_keys,
1359  value_output C_values,
1360  nvbio::vector<system_tag,uint8>& temp_storage)
1361 {
1362  merge_by_key(
1363  system_tag(),
1364  A_len,
1365  B_len,
1366  A_keys,
1367  B_keys,
1368  A_values,
1369  B_values,
1370  C_keys,
1371  C_values );
1372 }
1373 
1374 #if defined(__CUDACC__)
1375 
1378 template <typename iterator_type, typename functor_type>
1379 __global__
1380 void for_each_kernel(const uint64 n, const iterator_type in, const functor_type f)
1381 {
1382  const uint32 grid_size = blockDim.x * gridDim.x;
1383 
1384  for (uint64 i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += grid_size)
1385  f( in[i] );
1386 };
1387 
1388 #endif
1389 
1390 // ask the optimizer how many blocks we should try using next
1391 //
1392 template <typename KernelFunction>
1393 uint32 for_each_enactor<device_tag>::suggested_blocks(KernelFunction kernel, const uint32 cta_size) const
1394 {
1395 #if defined(__CUDACC__)
1396  if (m_blocks_hi == 0)
1398  else if (m_blocks_lo == 0)
1399  return cuda::multiprocessor_count();
1400  else
1401  return cuda::multiprocessor_count() * (m_blocks_lo + m_blocks_hi) / 2;
1402 #else
1403  return 0u;
1404 #endif
1405 }
1406 
1407 // update the optimizer's internal state with the latest speed data-point
1408 //
1409 inline
1410 void for_each_enactor<device_tag>::update(const uint32 n_blocks, const float speed)
1411 {
1412 #if defined(__CUDACC__)
1413  // carry out a little binary search over the best number of blocks/SM
1414  if (m_blocks_hi == 0)
1415  {
1416  m_blocks_hi = n_blocks / cuda::multiprocessor_count();
1417  m_speed_hi = speed;
1418  }
1419  else if (m_blocks_lo == 0)
1420  {
1421  m_blocks_lo = n_blocks / cuda::multiprocessor_count();
1422  m_speed_lo = speed;
1423  }
1424  else if (m_speed_lo > m_speed_hi)
1425  {
1426  m_blocks_hi = n_blocks / cuda::multiprocessor_count();
1427  m_speed_hi = speed;
1428  }
1429  else
1430  {
1431  m_blocks_lo = n_blocks / cuda::multiprocessor_count();
1432  m_speed_lo = speed;
1433  }
1434  // TODO: once the optimizer settles to a given value, it will never change:
1435  // we should explore using occasional "mutations" to adapt to possibly
1436  // changing conditions...
1437 #endif
1438 }
1439 
1440 // enact the for_each
1441 //
1442 template <typename Iterator, typename Functor>
1444  const uint64 n,
1445  const Iterator in,
1446  Functor functor)
1447 {
1448 #if defined(__CUDACC__)
1449  const uint32 blockdim = 128;
1450  const uint32 n_blocks = suggested_blocks( for_each_kernel<Iterator,Functor>, blockdim );
1451 
1452  cuda::Timer timer;
1453  timer.start();
1454 
1455  for_each_kernel<<<n_blocks,blockdim>>>( n, in, functor );
1456 
1457  timer.stop();
1458 
1459  update( n_blocks, float(n) / timer.seconds() );
1460 #endif
1461 }
1462 
1463 } // namespace nvbio