34 template <
typename PredicateIterator>
38 const PredicateIterator pred)
44 thrust::logical_or<bool>() );
49 template <
typename PredicateIterator>
53 const PredicateIterator pred)
59 thrust::logical_and<bool>() );
62 #if defined(__CUDACC__)
66 template <
typename PredicateIterator>
70 const PredicateIterator pred)
77 template <
typename PredicateIterator>
81 const PredicateIterator pred)
90 template <
typename system_tag,
typename PredicateIterator>
93 const PredicateIterator pred)
95 return any( system_tag(), n, pred );
100 template <
typename system_tag,
typename PredicateIterator>
103 const PredicateIterator pred)
105 return all( system_tag(), n, pred );
110 template <
typename Iterator1,
typename Iterator2>
141 template <
typename T1,
typename T2>
148 template <
typename T1,
typename T2>
155 template <
typename T1,
typename T2>
159 return it1.
it1 != it2.
it1;
162 template <
typename T1,
typename T2>
166 return it1.
it1 == it2.
it1;
171 template <
typename Iterator1,
typename Iterator2,
typename Headflags>
203 template <
typename T1,
typename T2,
typename H>
210 template <
typename T1,
typename T2,
typename H>
217 template <
typename T1,
typename T2,
typename H>
221 return it1.
it1 != it2.
it1;
224 template <
typename T1,
typename T2,
typename H>
228 return it1.
it1 == it2.
it1;
233 template <
typename system_tag,
typename Iterator>
236 const Iterator values)
244 template <
typename system_tag,
typename Iterator,
typename Headflags>
247 const Iterator values,
248 const Headflags flags)
255 template <
typename Iterator,
typename Functor>
263 #pragma omp parallel for if (n >= 256)
271 template <
typename Iterator,
typename Functor>
283 template <
typename system_tag,
typename Iterator,
typename Functor>
289 return for_each( system_tag(), n, in, functor );
294 template <
typename Iterator,
typename Output,
typename Functor>
300 const Functor functor)
307 template <
typename Iterator,
typename Output,
typename Functor>
313 const Functor functor)
316 #pragma omp parallel for if (n >= 256)
319 out[i] = functor( in[i] );
324 template <
typename Iterator1,
typename Iterator2,
typename Output,
typename Functor>
331 const Functor functor)
338 template <
typename Iterator1,
typename Iterator2,
typename Output,
typename Functor>
345 const Functor functor)
348 #pragma omp parallel for if (n >= 256)
351 out[i] = functor( in1[i], in2[i] );
356 template <
typename system_tag,
typename Iterator,
typename Output,
typename Functor>
361 const Functor functor)
363 transform( system_tag(), n, in, out, functor );
368 template <
typename system_tag,
typename Iterator1,
typename Iterator2,
typename Output,
typename Functor>
374 const Functor functor)
376 transform( system_tag(), n, in1, in2, out, functor );
386 template <
typename InputIterator,
typename BinaryOp>
387 typename std::iterator_traits<InputIterator>::value_type
reduce(
405 template <
typename InputIterator,
typename OutputIterator,
typename BinaryOp>
430 template <
typename InputIterator,
typename OutputIterator,
typename BinaryOp,
typename Identity>
448 #if defined(__CUDACC__)
457 template <
typename InputIterator,
typename BinaryOp>
458 typename std::iterator_traits<InputIterator>::value_type
reduce(
476 template <
typename InputIterator,
typename OutputIterator,
typename BinaryOp>
497 template <
typename InputIterator,
typename OutputIterator,
typename BinaryOp,
typename Identity>
519 template <
typename system_tag,
typename InputIterator,
typename BinaryOp>
520 typename std::iterator_traits<InputIterator>::value_type
reduce(
542 template <
typename system_tag,
typename InputIterator,
typename OutputIterator,
typename BinaryOp>
568 template <
typename system_tag,
typename InputIterator,
typename OutputIterator,
typename BinaryOp,
typename Identity>
597 template <
typename InputIterator,
typename FlagsIterator,
typename OutputIterator>
624 template <
typename InputIterator,
typename OutputIterator,
typename Predicate>
630 const Predicate pred,
650 template <
typename InputIterator,
typename OutputIterator,
typename CountIterator>
656 CountIterator counts,
662 thrust::make_constant_iterator<uint32>( 1u ),
664 counts ).first - out );
680 template <
typename KeyIterator,
typename ValueIterator,
typename OutputKeyIterator,
typename OutputValueIterator,
typename ReductionOp>
685 ValueIterator values_in,
686 OutputKeyIterator keys_out,
687 OutputValueIterator values_out,
688 ReductionOp reduction_op,
691 typedef typename std::iterator_traits<KeyIterator>::value_type key_type;
700 reduction_op ).first - keys_out );
703 #if defined(__CUDACC__)
715 template <
typename InputIterator,
typename FlagsIterator,
typename OutputIterator>
717 const device_tag tag,
737 template <
typename InputIterator,
typename OutputIterator,
typename Predicate>
739 const device_tag tag,
743 const Predicate pred,
759 template <
typename InputIterator,
typename OutputIterator,
typename CountIterator>
761 const device_tag tag,
765 CountIterator counts,
783 template <
typename KeyIterator,
typename ValueIterator,
typename OutputKeyIterator,
typename OutputValueIterator,
typename ReductionOp>
785 const device_tag tag,
788 ValueIterator values_in,
789 OutputKeyIterator keys_out,
790 OutputValueIterator values_out,
791 ReductionOp reduction_op,
816 template <
typename system_tag,
typename InputIterator,
typename FlagsIterator,
typename OutputIterator>
824 return copy_flagged( system_tag(), n, in, flags, out, temp_storage );
837 template <
typename system_tag,
typename InputIterator,
typename OutputIterator,
typename Predicate>
842 const Predicate pred,
845 return copy_if( system_tag(), n, in, out, pred, temp_storage );
858 template <
typename system_tag,
typename InputIterator,
typename OutputIterator,
typename CountIterator>
863 CountIterator counts,
881 template <
typename system_tag,
typename KeyIterator,
typename ValueIterator,
typename OutputKeyIterator,
typename OutputValueIterator,
typename ReductionOp>
885 ValueIterator values_in,
886 OutputKeyIterator keys_out,
887 OutputValueIterator values_out,
888 ReductionOp reduction_op,
909 template <
typename KeyIterator,
typename ValueIterator,
typename OutputIterator>
913 ValueIterator values,
916 OutputIterator indices)
931 template <
typename KeyIterator,
typename ValueIterator,
typename OutputIterator>
935 ValueIterator values,
938 OutputIterator indices)
940 #pragma omp parallel for
941 for (
long i = 0; i < long(n); ++i)
952 template <
typename system_tag,
typename KeyIterator,
typename ValueIterator,
typename OutputIterator>
955 ValueIterator values,
958 OutputIterator indices)
976 template <
typename KeyIterator,
typename ValueIterator,
typename OutputIterator>
980 ValueIterator values,
983 OutputIterator indices)
998 template <
typename KeyIterator,
typename ValueIterator,
typename OutputIterator>
1002 ValueIterator values,
1005 OutputIterator indices)
1007 #pragma omp parallel for
1008 for (
long i = 0; i < long(n); ++i)
1019 template <
typename system_tag,
typename KeyIterator,
typename ValueIterator,
typename OutputIterator>
1022 ValueIterator values,
1025 OutputIterator indices)
1036 #if defined(__CUDACC__)
1043 template <
typename KeyIterator>
1045 const device_tag tag,
1050 typedef typename std::iterator_traits<KeyIterator>::value_type key_type;
1054 key_type* keys_ptr =
reinterpret_cast<key_type*
>(
raw_pointer( temp_storage ) );
1056 thrust::device_ptr<key_type> keys_buf( keys_ptr );
1060 cuda::SortBuffers<key_type*> sort_buffers;
1061 sort_buffers.keys[0] = keys_ptr;
1062 sort_buffers.keys[1] = keys_ptr + n;
1064 cuda::SortEnactor sort_enactor;
1065 sort_enactor.sort( n, sort_buffers );
1068 keys_buf + sort_buffers.selector * n,
1069 keys_buf + sort_buffers.selector * n + n,
1079 template <
typename KeyIterator,
typename ValueIterator>
1081 const device_tag tag,
1084 ValueIterator values,
1087 typedef typename std::iterator_traits<KeyIterator>::value_type key_type;
1088 typedef typename std::iterator_traits<ValueIterator>::value_type value_type;
1090 const uint32 aligned_key_bytes = align<16>( 2 * n *
sizeof(key_type) );
1091 const uint32 aligned_val_bytes = 2 * n *
sizeof(value_type);
1094 key_type* keys_ptr =
reinterpret_cast<key_type*
>(
raw_pointer( temp_storage ) );
1095 value_type* values_ptr =
reinterpret_cast<value_type*
>(
raw_pointer( temp_storage ) + aligned_key_bytes );
1097 thrust::device_ptr<key_type> keys_buf( keys_ptr );
// values_buf wraps values_ptr, which is a value_type*, so its element type must be value_type.
1098 thrust::device_ptr<value_type> values_buf( values_ptr );
1103 cuda::SortBuffers<key_type*, value_type*> sort_buffers;
1104 sort_buffers.keys[0] = keys_ptr;
1105 sort_buffers.keys[1] = keys_ptr + n;
1106 sort_buffers.values[0] = values_ptr;
1107 sort_buffers.values[1] = values_ptr + n;
1109 cuda::SortEnactor sort_enactor;
1110 sort_enactor.sort( n, sort_buffers );
1113 keys_buf + sort_buffers.selector * n,
1114 keys_buf + sort_buffers.selector * n + n,
1118 values_buf + sort_buffers.selector * n,
1119 values_buf + sort_buffers.selector * n + n,
1130 template <
typename KeyIterator>
1145 template <
typename system_tag,
typename KeyIterator>
1151 radix_sort( system_tag(), n, keys, temp_storage );
1160 template <
typename KeyIterator,
typename ValueIterator>
1165 ValueIterator values,
// The 4-argument thrust::sort_by_key takes a comparator last; temp_storage is not one, so it is not forwarded.
1168 thrust::sort_by_key( keys, keys + n, values );
1177 template <
typename system_tag,
typename KeyIterator,
typename ValueIterator>
1181 ValueIterator values,
1184 radix_sort( system_tag(), n, keys, values, temp_storage );
1188 typename key_iterator1,
1189 typename key_iterator2>
1192 const key_iterator1 A,
1194 const key_iterator2 B,
1200 int32 j_lo = i >= n ? i - n : 0;
// Both index bounds must hold before A[j-1]/B[k] (resp. B[k-1]/A[j]) are dereferenced.
1205 if ((j > 0 && k < n) && A[j-1] > B[k])
1212 assert( j + k == i );
1214 else if ((k > 0 && j < m) && B[k-1] >= A[j])
1221 assert( j + k == i );
1230 typename key_iterator1,
1231 typename key_iterator2,
1232 typename value_iterator1,
1233 typename value_iterator2,
1234 typename key_output,
1235 typename value_output>
1240 const key_iterator1 A_keys,
1241 const key_iterator2 B_keys,
1242 const value_iterator1 A_values,
1243 const value_iterator2 B_values,
1245 value_output C_values)
1249 #pragma omp parallel for
// NOTE(review): this branch precedes `else if (B_len == 0)`, so it presumably handles an empty A
// and must copy from B — confirm against the enclosing condition and loop bound.
1252 C_keys[i] = B_keys[i];
1253 C_values[i] = B_values[i];
1256 else if (B_len == 0)
1258 #pragma omp parallel for
1261 C_keys[i] = A_keys[i];
1262 C_values[i] = A_values[i];
1271 const uint32 C_len = A_len + B_len;
// Partition boundaries: entry 0 marks the start of both inputs, entry n_threads marks their end.
1273 A_diag[ 0 ] = 0;
1274 B_diag[ 0 ] = 0;
1275 A_diag[ n_threads ] = A_len;
1276 B_diag[ n_threads ] = B_len;
1280 #pragma omp parallel for num_threads(n_threads)
1281 for (
int32 i = 1; i <
int32( n_threads ); ++i)
1283 const int32 index = i * n_partition;
1285 const uint2 jk =
corank( index, A_keys, A_len, B_keys, B_len );
1291 #pragma omp parallel for num_threads(n_threads)
1292 for (int32 i = 0; i <
int32( n_threads ); ++i)
1296 A_keys + A_diag[i+1],
1298 B_keys + B_diag[i+1],
1299 A_values + A_diag[i],
1300 B_values + B_diag[i],
1301 C_keys + i * n_partition,
1302 C_values + i * n_partition );
1315 typename key_iterator1,
1316 typename key_iterator2,
1317 typename value_iterator1,
1318 typename value_iterator2,
1319 typename key_output,
1320 typename value_output>
1325 const key_iterator1 A_keys,
1326 const key_iterator2 B_keys,
1327 const value_iterator1 A_values,
1328 const value_iterator2 B_values,
1330 value_output C_values)
1344 typename system_tag,
1345 typename key_iterator1,
1346 typename key_iterator2,
1347 typename value_iterator1,
1348 typename value_iterator2,
1349 typename key_output,
1350 typename value_output>
1354 const key_iterator1 A_keys,
1355 const key_iterator2 B_keys,
1356 const value_iterator1 A_values,
1357 const value_iterator2 B_values,
1359 value_output C_values,
1374 #if defined(__CUDACC__)
1378 template <
typename iterator_type,
typename functor_type>
1380 void for_each_kernel(
const uint64 n,
const iterator_type in,
const functor_type f)
1382 const uint32 grid_size = blockDim.x * gridDim.x;
1384 for (
uint64 i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += grid_size)
1392 template <
typename KernelFunction>
1393 uint32 for_each_enactor<device_tag>::suggested_blocks(KernelFunction kernel,
const uint32 cta_size)
const
1395 #if defined(__CUDACC__)
1396 if (m_blocks_hi == 0)
1398 else if (m_blocks_lo == 0)
1410 void for_each_enactor<device_tag>::update(
const uint32 n_blocks,
const float speed)
1412 #if defined(__CUDACC__)
1414 if (m_blocks_hi == 0)
1419 else if (m_blocks_lo == 0)
1424 else if (m_speed_lo > m_speed_hi)
1442 template <
typename Iterator,
typename Functor>
1448 #if defined(__CUDACC__)
1449 const uint32 blockdim = 128;
1450 const uint32 n_blocks = suggested_blocks( for_each_kernel<Iterator,Functor>, blockdim );
1455 for_each_kernel<<<n_blocks,blockdim>>>( n, in, functor );
1459 update( n_blocks,
float(n) / timer.
seconds() );