34 template <
typename PredicateIterator>
38 const PredicateIterator pred)
44 thrust::logical_or<bool>() );
49 template <
typename PredicateIterator>
53 const PredicateIterator pred)
59 thrust::logical_and<bool>() );
62 #if defined(__CUDACC__) 66 template <
typename PredicateIterator>
70 const PredicateIterator pred)
77 template <
typename PredicateIterator>
81 const PredicateIterator pred)
90 template <
typename system_tag,
typename PredicateIterator>
93 const PredicateIterator pred)
95 return any( system_tag(), n, pred );
100 template <
typename system_tag,
typename PredicateIterator>
103 const PredicateIterator pred)
105 return all( system_tag(), n, pred );
110 template <
typename Iterator1,
typename Iterator2>
113 typedef bool value_type;
114 typedef value_type& reference;
115 typedef value_type const_reference;
116 typedef value_type* pointer;
117 typedef typename std::iterator_traits<Iterator1>::difference_type difference_type;
118 typedef typename std::iterator_traits<Iterator1>::iterator_category iterator_category;
121 CUGAR_FORCEINLINE CUGAR_HOST_DEVICE
122 is_sorted_iterator(
const Iterator1 _it1,
const Iterator2 _it2) : it1( _it1 ), it2( _it2 ) {}
125 CUGAR_FORCEINLINE CUGAR_HOST_DEVICE
126 bool operator[] (
const uint64 i)
const {
return it1[i] <= it2[i]; }
129 CUGAR_FORCEINLINE CUGAR_HOST_DEVICE
130 bool operator* ()
const {
return it1[0] <= it2[0]; }
133 CUGAR_FORCEINLINE CUGAR_HOST_DEVICE
134 is_sorted_iterator& operator++ () { ++it1; ++it2;
return *
this; }
141 template <
typename T1,
typename T2>
142 CUGAR_FORCEINLINE CUGAR_HOST_DEVICE
148 template <
typename T1,
typename T2>
149 CUGAR_FORCEINLINE CUGAR_HOST_DEVICE
152 return it1.it1 - it2.it1;
155 template <
typename T1,
typename T2>
156 CUGAR_FORCEINLINE CUGAR_HOST_DEVICE
159 return it1.it1 != it2.it1;
162 template <
typename T1,
typename T2>
163 CUGAR_FORCEINLINE CUGAR_HOST_DEVICE
166 return it1.it1 == it2.it1;
171 template <
typename Iterator1,
typename Iterator2,
typename Headflags>
174 typedef bool value_type;
175 typedef value_type& reference;
176 typedef value_type const_reference;
177 typedef value_type* pointer;
178 typedef typename std::iterator_traits<Iterator1>::difference_type difference_type;
179 typedef typename std::iterator_traits<Iterator1>::iterator_category iterator_category;
182 CUGAR_FORCEINLINE CUGAR_HOST_DEVICE
183 is_segment_sorted_iterator(
const Iterator1 _it1,
const Iterator2 _it2,
const Headflags _hd) : it1( _it1 ), it2( _it2 ), hd(_hd) {}
186 CUGAR_FORCEINLINE CUGAR_HOST_DEVICE
187 bool operator[] (
const uint64 i)
const {
return hd[i] || (it1[i] <= it2[i]); }
190 CUGAR_FORCEINLINE CUGAR_HOST_DEVICE
191 bool operator* ()
const {
return hd[0] || (it1[0] <= it2[0]); }
194 CUGAR_FORCEINLINE CUGAR_HOST_DEVICE
195 is_segment_sorted_iterator& operator++ () { ++it1; ++it2; ++hd;
return *
this; }
203 template <
typename T1,
typename T2,
typename H>
204 CUGAR_FORCEINLINE CUGAR_HOST_DEVICE
210 template <
typename T1,
typename T2,
typename H>
211 CUGAR_FORCEINLINE CUGAR_HOST_DEVICE
214 return it1.it1 - it2.it1;
217 template <
typename T1,
typename T2,
typename H>
218 CUGAR_FORCEINLINE CUGAR_HOST_DEVICE
221 return it1.it1 != it2.it1;
224 template <
typename T1,
typename T2,
typename H>
225 CUGAR_FORCEINLINE CUGAR_HOST_DEVICE
228 return it1.it1 == it2.it1;
233 template <
typename system_tag,
typename Iterator>
236 const Iterator values)
244 template <
typename system_tag,
typename Iterator,
typename Headflags>
247 const Iterator values,
248 const Headflags flags)
255 template <
typename Iterator,
typename Functor>
263 #pragma omp parallel for if (n >= 256) 265 for (int64 i = 0; i < int64(n); ++i)
271 template <
typename Iterator,
typename Functor>
283 template <
typename system_tag,
typename Iterator,
typename Functor>
289 return for_each( system_tag(), n, in, functor );
294 template <
typename Iterator,
typename Output,
typename Functor>
300 const Functor functor)
307 template <
typename Iterator,
typename Output,
typename Functor>
313 const Functor functor)
316 #pragma omp parallel for if (n >= 256) 318 for (int64 i = 0; i < int64(n); ++i)
319 out[i] = functor( in[i] );
324 template <
typename Iterator1,
typename Iterator2,
typename Output,
typename Functor>
331 const Functor functor)
338 template <
typename Iterator1,
typename Iterator2,
typename Output,
typename Functor>
345 const Functor functor)
348 #pragma omp parallel for if (n >= 256) 350 for (int64 i = 0; i < int64(n); ++i)
351 out[i] = functor( in1[i], in2[i] );
356 template <
typename system_tag,
typename Iterator,
typename Output,
typename Functor>
361 const Functor functor)
363 transform( system_tag(), n, in, out, functor );
368 template <
typename system_tag,
typename Iterator1,
typename Iterator2,
typename Output,
typename Functor>
374 const Functor functor)
376 transform( system_tag(), n, in1, in2, out, functor );
386 template <
typename InputIterator,
typename BinaryOp>
387 typename std::iterator_traits<InputIterator>::value_type
reduce(
405 template <
typename InputIterator,
typename OutputIterator,
typename BinaryOp>
430 template <
typename InputIterator,
typename OutputIterator,
typename BinaryOp,
typename Identity>
448 #if defined(__CUDACC__) 457 template <
typename InputIterator,
typename BinaryOp>
458 typename std::iterator_traits<InputIterator>::value_type
reduce(
476 template <
typename InputIterator,
typename OutputIterator,
typename BinaryOp>
497 template <
typename InputIterator,
typename OutputIterator,
typename BinaryOp,
typename Identity>
519 template <
typename system_tag,
typename InputIterator,
typename BinaryOp>
520 typename std::iterator_traits<InputIterator>::value_type
reduce(
542 template <
typename system_tag,
typename InputIterator,
typename OutputIterator,
typename BinaryOp>
568 template <
typename system_tag,
typename InputIterator,
typename OutputIterator,
typename BinaryOp,
typename Identity>
597 template <
typename InputIterator,
typename FlagsIterator,
typename OutputIterator>
624 template <
typename InputIterator,
typename OutputIterator,
typename Predicate>
630 const Predicate pred,
650 template <
typename InputIterator,
typename OutputIterator,
typename CountIterator>
656 CountIterator counts,
662 thrust::make_constant_iterator<uint32>( 1u ),
664 counts ).first - out );
680 template <
typename KeyIterator,
typename ValueIterator,
typename OutputKeyIterator,
typename OutputValueIterator,
typename ReductionOp>
685 ValueIterator values_in,
686 OutputKeyIterator keys_out,
687 OutputValueIterator values_out,
688 ReductionOp reduction_op,
691 typedef typename std::iterator_traits<KeyIterator>::value_type key_type;
700 reduction_op ).first - keys_out );
703 #if defined(__CUDACC__) 715 template <
typename InputIterator,
typename FlagsIterator,
typename OutputIterator>
737 template <
typename InputIterator,
typename OutputIterator,
typename Predicate>
743 const Predicate pred,
759 template <
typename InputIterator,
typename OutputIterator,
typename CountIterator>
765 CountIterator counts,
783 template <
typename KeyIterator,
typename ValueIterator,
typename OutputKeyIterator,
typename OutputValueIterator,
typename ReductionOp>
788 ValueIterator values_in,
789 OutputKeyIterator keys_out,
790 OutputValueIterator values_out,
791 ReductionOp reduction_op,
816 template <
typename system_tag,
typename InputIterator,
typename FlagsIterator,
typename OutputIterator>
824 return copy_flagged( system_tag(), n, in, flags, out, temp_storage );
837 template <
typename system_tag,
typename InputIterator,
typename OutputIterator,
typename Predicate>
842 const Predicate pred,
845 return copy_if( system_tag(), n, in, out, pred, temp_storage );
858 template <
typename system_tag,
typename InputIterator,
typename OutputIterator,
typename CountIterator>
863 CountIterator counts,
881 template <
typename system_tag,
typename KeyIterator,
typename ValueIterator,
typename OutputKeyIterator,
typename OutputValueIterator,
typename ReductionOp>
885 ValueIterator values_in,
886 OutputKeyIterator keys_out,
887 OutputValueIterator values_out,
888 ReductionOp reduction_op,
902 #if defined(__CUDACC__) 905 template <
typename KeyIterator,
typename ValueIterator,
typename OutputIterator>
907 void lower_bound_kernel(
909 ValueIterator values,
912 OutputIterator indices)
914 const uint32 i = threadIdx.x + blockIdx.x * blockDim.x;
921 template <
typename KeyIterator,
typename ValueIterator,
typename OutputIterator>
923 void upper_bound_kernel(
925 ValueIterator values,
928 OutputIterator indices)
930 const uint32 i = threadIdx.x + blockIdx.x * blockDim.x;
943 template <
typename KeyIterator,
typename ValueIterator,
typename OutputIterator>
947 ValueIterator values,
950 OutputIterator indices)
957 #elif defined(__CUDACC__) 958 const uint32 blockdim = 128;
959 const uint32 n_blocks =
divide_ri( n, blockdim );
961 lower_bound_kernel<<<n_blocks,blockdim>>>( n, values, n_keys, keys, indices );
972 template <
typename KeyIterator,
typename ValueIterator,
typename OutputIterator>
976 ValueIterator values,
979 OutputIterator indices)
981 #pragma omp parallel for 982 for (
long i = 0; i < long(n); ++i)
983 indices[i] = uint32(
lower_bound( values[i], keys, n_keys ) - keys );
993 template <
typename system_tag,
typename KeyIterator,
typename ValueIterator,
typename OutputIterator>
996 ValueIterator values,
999 OutputIterator indices)
1017 template <
typename KeyIterator,
typename ValueIterator,
typename OutputIterator>
1021 ValueIterator values,
1022 const uint32 n_keys,
1024 OutputIterator indices)
1028 keys, keys + n_keys,
1031 #elif defined(__CUDACC__) 1032 const uint32 blockdim = 128;
1033 const uint32 n_blocks =
divide_ri( n, blockdim );
1035 upper_bound_kernel<<<n_blocks,blockdim>>>( n, values, n_keys, keys, indices );
1046 template <
typename KeyIterator,
typename ValueIterator,
typename OutputIterator>
1050 ValueIterator values,
1051 const uint32 n_keys,
1053 OutputIterator indices)
1055 #pragma omp parallel for 1056 for (
long i = 0; i < long(n); ++i)
1057 indices[i] = uint32(
upper_bound( values[i], keys, n_keys ) - keys );
1067 template <
typename system_tag,
typename KeyIterator,
typename ValueIterator,
typename OutputIterator>
1070 ValueIterator values,
1071 const uint32 n_keys,
1073 OutputIterator indices)
1084 #if defined(__CUDACC__) 1091 template <
typename KeyIterator>
1098 typedef typename std::iterator_traits<KeyIterator>::value_type key_type;
1102 key_type* keys_ptr =
reinterpret_cast<key_type*
>(
raw_pointer( temp_storage ) );
1104 thrust::device_ptr<key_type> keys_buf( keys_ptr );
1106 thrust::copy( keys, keys + n, keys_buf );
1109 sort_buffers.keys[0] = keys_ptr;
1110 sort_buffers.keys[1] = keys_ptr + n;
1113 sort_enactor.sort( n, sort_buffers );
1116 keys_buf + sort_buffers.selector * n,
1117 keys_buf + sort_buffers.selector * n + n,
1127 template <
typename KeyIterator,
typename ValueIterator>
1132 ValueIterator values,
1135 typedef typename std::iterator_traits<KeyIterator>::value_type key_type;
1136 typedef typename std::iterator_traits<ValueIterator>::value_type value_type;
1138 const uint32 aligned_key_bytes = align<16>( 2 * n *
sizeof(key_type) );
1139 const uint32 aligned_val_bytes = 2 * n *
sizeof(value_type);
1142 key_type* keys_ptr =
reinterpret_cast<key_type*
>(
raw_pointer( temp_storage ) );
1143 value_type* values_ptr =
reinterpret_cast<value_type*
>(
raw_pointer( temp_storage ) + aligned_key_bytes );
1145 thrust::device_ptr<key_type> keys_buf( keys_ptr );
1146 thrust::device_ptr<key_type> values_buf( values_ptr );
1148 thrust::copy( keys, keys + n, keys_buf );
1149 thrust::copy( values, values + n, values_buf );
1152 sort_buffers.keys[0] = keys_ptr;
1153 sort_buffers.keys[1] = keys_ptr + n;
1154 sort_buffers.values[0] = values_ptr;
1155 sort_buffers.values[1] = values_ptr + n;
1158 sort_enactor.sort( n, sort_buffers );
1161 keys_buf + sort_buffers.selector * n,
1162 keys_buf + sort_buffers.selector * n + n,
1166 values_buf + sort_buffers.selector * n,
1167 values_buf + sort_buffers.selector * n + n,
1178 template <
typename KeyIterator>
1185 thrust::sort( keys, keys + n );
1193 template <
typename system_tag,
typename KeyIterator>
1199 radix_sort( system_tag(), n, keys, temp_storage );
1208 template <
typename KeyIterator,
typename ValueIterator>
1213 ValueIterator values,
1216 thrust::sort_by_key( keys, keys + n, values, temp_storage );
1225 template <
typename system_tag,
typename KeyIterator,
typename ValueIterator>
1229 ValueIterator values,
1232 radix_sort( system_tag(), n, keys, values, temp_storage );
1236 typename key_iterator1,
1237 typename key_iterator2>
1241 const key_iterator1 A,
1243 const key_iterator2 B,
1246 int32 j = min( i, m );
1249 int32 j_lo = i >= n ? i - n : 0;
1254 if ((j > 0 || k < n) && A[j-1] > B[k])
1257 const int32 delta =
divide_ri( j - j_lo, 2 );
1261 assert( j + k == i );
1263 else if ((k > 0 || j < m) && B[k-1] >= A[j])
1266 const int32 delta =
divide_ri( k - k_lo, 2 );
1270 assert( j + k == i );
1275 return make_uint2( uint32(j), uint32(k) );
1279 typename key_iterator1,
1280 typename key_iterator2,
1281 typename value_iterator1,
1282 typename value_iterator2,
1283 typename key_output,
1284 typename value_output>
1289 const key_iterator1 A_keys,
1290 const key_iterator2 B_keys,
1291 const value_iterator1 A_values,
1292 const value_iterator2 B_values,
1294 value_output C_values)
1298 #pragma omp parallel for 1299 for (int32 i = 0; i < int32( B_len ); ++i)
1301 C_keys[i] = A_keys[i];
1302 C_values[i] = A_values[i];
1305 else if (B_len == 0)
1307 #pragma omp parallel for 1308 for (int32 i = 0; i < int32( A_len ); ++i)
1310 C_keys[i] = A_keys[i];
1311 C_values[i] = A_values[i];
1315 const uint32 n_threads = (uint32)omp_get_num_procs();
1320 const uint32 C_len = A_len + B_len;
1322 A_diag[ n_threads ] = 0;
1323 B_diag[ n_threads ] = 0;
1324 A_diag[ n_threads ] = A_len;
1325 B_diag[ n_threads ] = B_len;
1327 const uint32 n_partition =
divide_ri( C_len, n_threads );
1329 #pragma omp parallel for num_threads(n_threads) 1330 for (int32 i = 1; i < int32( n_threads ); ++i)
1332 const int32 index = i * n_partition;
1334 const uint2 jk = corank( index, A_keys, A_len, B_keys, B_len );
1340 #pragma omp parallel for num_threads(n_threads) 1341 for (int32 i = 0; i < int32( n_threads ); ++i)
1345 A_keys + A_diag[i+1],
1347 B_keys + B_diag[i+1],
1348 A_values + A_diag[i],
1349 B_values + B_diag[i],
1350 C_keys + i * n_partition,
1351 C_values + i * n_partition );
1364 typename key_iterator1,
1365 typename key_iterator2,
1366 typename value_iterator1,
1367 typename value_iterator2,
1368 typename key_output,
1369 typename value_output>
1374 const key_iterator1 A_keys,
1375 const key_iterator2 B_keys,
1376 const value_iterator1 A_values,
1377 const value_iterator2 B_values,
1379 value_output C_values)
1393 typename system_tag,
1394 typename key_iterator1,
1395 typename key_iterator2,
1396 typename value_iterator1,
1397 typename value_iterator2,
1398 typename key_output,
1399 typename value_output>
1403 const key_iterator1 A_keys,
1404 const key_iterator2 B_keys,
1405 const value_iterator1 A_values,
1406 const value_iterator2 B_values,
1408 value_output C_values,
1423 #if defined(__CUDACC__) 1427 template <
typename iterator_type,
typename functor_type>
1429 void for_each_kernel(
const uint64 n,
const iterator_type in,
const functor_type f)
1431 const uint32 grid_size = blockDim.x * gridDim.x;
1433 for (uint64 i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += grid_size)
1441 template <
typename KernelFunction>
1444 #if defined(__CUDACC__) 1445 if (m_blocks_hi == 0)
1446 return cuda::multiprocessor_count() * cuda::max_active_blocks_per_multiprocessor( kernel, cta_size, 0u );
1447 else if (m_blocks_lo == 0)
1448 return cuda::multiprocessor_count();
1450 return cuda::multiprocessor_count() * (m_blocks_lo + m_blocks_hi) / 2;
1461 #if defined(__CUDACC__) 1463 if (m_blocks_hi == 0)
1465 m_blocks_hi = uint32(n_blocks / cuda::multiprocessor_count());
1468 else if (m_blocks_lo == 0)
1470 m_blocks_lo = uint32(n_blocks / cuda::multiprocessor_count());
1473 else if (m_speed_lo > m_speed_hi)
1475 m_blocks_hi = uint32(n_blocks / cuda::multiprocessor_count());
1480 m_blocks_lo = uint32(n_blocks / cuda::multiprocessor_count());
1491 template <
typename Iterator,
typename Functor>
1497 #if defined(__CUDACC__) 1498 const uint32 blockdim = 128;
1499 const uint32 n_blocks = suggested_blocks( for_each_kernel<Iterator,Functor>, blockdim );
1504 for_each_kernel<<<n_blocks,blockdim>>>( n, in, functor );
1508 update( n_blocks,
float(n) / timer.
seconds() );
CUGAR_FORCEINLINE CUGAR_HOST_DEVICE index_type upper_bound_index(const Value x, Iterator begin, const index_type n)
Definition: algorithms.h:193
void radix_sort(const uint32 n, KeyIterator keys, cugar::vector< system_tag, uint8 > &temp_storage)
Definition: primitives_inl.h:1194
void transform(const uint32 n, const Iterator in, const Output out, const Functor functor)
Definition: primitives_inl.h:357
Definition: primitives_inl.h:111
std::iterator_traits< InputIterator >::value_type reduce(const uint32 n, InputIterator d_in, BinaryOp op, thrust::device_vector< uint8 > &d_temp_storage)
Definition: primitives_inl.h:185
CUGAR_FORCEINLINE CUGAR_HOST_DEVICE void merge_by_key(key_iterator1 first1, key_iterator1 end1, key_iterator2 first2, key_iterator2 end2, value_iterator1 values1, value_iterator2 values2, key_iterator output_keys, value_iterator output_values)
Definition: algorithms.h:273
bool any(const uint32 n, const PredicateIterator pred)
Definition: primitives_inl.h:91
uint32 copy_if(const uint32 n, InputIterator d_in, OutputIterator d_out, const Predicate pred, thrust::device_vector< uint8 > &d_temp_storage)
Definition: primitives_inl.h:349
uint32 runlength_encode(const uint32 n, InputIterator d_in, OutputIterator d_out, CountIterator d_counts, thrust::device_vector< uint8 > &d_temp_storage)
Definition: primitives_inl.h:392
void alloc_temp_storage(VectorType &vec, const uint64 size)
Definition: primitives_inl.h:39
T * raw_pointer(thrust::device_vector< T, Alloc > &vec)
Definition: thrust_view.h:69
bool is_sorted(const uint32 n, const Iterator values)
Definition: primitives_inl.h:234
Definition: primitives_inl.h:172
bool all(const uint32 n, const PredicateIterator pred)
Definition: primitives_inl.h:107
uint32 runlength_encode(const uint32 n, InputIterator in, OutputIterator out, CountIterator counts, cugar::vector< system_tag, uint8 > &temp_storage)
Definition: primitives_inl.h:859
CUGAR_HOST_DEVICE L divide_ri(const L x, const R y)
Definition: numbers.h:180
void start()
Definition: timer.h:103
void inclusive_scan(const uint32 n, InputIterator d_in, OutputIterator d_out, BinaryOp op, thrust::device_vector< uint8 > &d_temp_storage)
Definition: primitives_inl.h:228
uint32 reduce_by_key(const uint32 n, KeyIterator d_keys_in, ValueIterator d_values_in, OutputKeyIterator d_keys_out, OutputValueIterator d_values_out, ReductionOp reduction_op, thrust::device_vector< uint8 > &d_temp_storage)
Definition: primitives_inl.h:438
bool all(const uint32 n, const PredicateIterator pred)
Definition: primitives_inl.h:101
CUGAR_FORCEINLINE CUGAR_HOST_DEVICE Iterator upper_bound(const Value x, Iterator begin, const index_type n)
Definition: algorithms.h:138
bool any(const uint32 n, const PredicateIterator pred)
Definition: primitives_inl.h:91
bool is_segment_sorted(const uint32 n, const Iterator values, const Headflags flags)
Definition: primitives_inl.h:245
uint32 copy_flagged(const uint32 n, InputIterator d_in, FlagsIterator d_flags, OutputIterator d_out, thrust::device_vector< uint8 > &d_temp_storage)
Definition: primitives_inl.h:306
void exclusive_scan(const uint32 n, InputIterator d_in, OutputIterator d_out, BinaryOp op, Identity identity, thrust::device_vector< uint8 > &d_temp_storage)
Definition: primitives_inl.h:265
void exclusive_scan(const uint32 n, InputIterator in, OutputIterator out, BinaryOp op, Identity identity, cugar::vector< system_tag, uint8 > &temp_storage)
Definition: primitives_inl.h:569
uint32 reduce_by_key(const uint32 n, KeyIterator keys_in, ValueIterator values_in, OutputKeyIterator keys_out, OutputValueIterator values_out, ReductionOp reduction_op, cugar::vector< system_tag, uint8 > &temp_storage)
Definition: primitives_inl.h:882
Define a vector_view POD type and plain_view() for std::vector.
Definition: diff.h:38
Definition: primitives.h:371
CUGAR_FORCEINLINE CUGAR_HOST_DEVICE index_type lower_bound_index(const Value x, Iterator begin, const index_type n)
Definition: algorithms.h:179
CUGAR_FORCEINLINE CUGAR_HOST_DEVICE Iterator lower_bound(const Value x, Iterator begin, const index_type n)
Definition: algorithms.h:99
uint32 copy_if(const uint32 n, InputIterator in, OutputIterator out, const Predicate pred, cugar::vector< system_tag, uint8 > &temp_storage)
Definition: primitives_inl.h:838
float seconds() const
Definition: timer.h:118
std::iterator_traits< InputIterator >::value_type reduce(const uint32 n, InputIterator in, BinaryOp op, cugar::vector< system_tag, uint8 > &temp_storage)
Definition: primitives_inl.h:520
void inclusive_scan(const uint32 n, InputIterator in, OutputIterator out, BinaryOp op, cugar::vector< system_tag, uint8 > &temp_storage)
Definition: primitives_inl.h:543
void stop()
Definition: timer.h:110
void for_each(const uint64 n, const Iterator in, Functor functor)
Definition: primitives_inl.h:284
uint32 copy_flagged(const uint32 n, InputIterator in, FlagsIterator flags, OutputIterator out, cugar::vector< system_tag, uint8 > &temp_storage)
Definition: primitives_inl.h:817
Definition: functors.h:214
Definition: functors.h:754
void operator()(const uint64 n, const Iterator in, Functor functor)
Definition: primitives.h:376