34 #include <thrust/device_vector.h>
35 #include <thrust/transform_scan.h>
36 #include <thrust/binary_search.h>
37 #include <thrust/iterator/constant_iterator.h>
38 #include <thrust/iterator/counting_iterator.h>
39 #include <thrust/sort.h>
40 #include <mgpuhost.cuh>
41 #include <moderngpu.cuh>
62 template <
typename OutputIterator>
66 OutputIterator _indices,
67 OutputIterator _slots) :
83 template <
typename InputIterator>
86 InputIterator in_indices,
87 InputIterator in_slots)
91 in_indices + in_count,
114 template <
typename InputIterator>
117 InputIterator delay_indices,
118 InputIterator delay_slots)
164 template <
typename string_type,
typename output_iterator,
typename delay_list_type>
166 const typename string_type::index_type string_len,
167 const string_type
string,
169 output_iterator d_suffixes,
170 const uint32 delay_min_threshold,
171 const uint32 delay_max_threshold,
172 delay_list_type& delay_list);
207 template <
typename set_type,
typename input_iterator,
typename output_iterator,
typename delay_list_type>
212 input_iterator d_input,
213 output_iterator d_output,
214 const uint32 delay_threshold,
215 delay_list_type& delay_list,
216 const uint32 slice_size = 8u);
234 fprintf(stderr,
"CompressionSort::reserve() : exception caught!\n");
248 (n+32) *
sizeof(
uint8) +
249 (n+32) *
sizeof(
uint8) +
250 (n+32) *
sizeof(
uint8);
258 d_temp_storage.size() *
sizeof(
uint8) +
259 d_temp_indices.size() *
sizeof(
uint32) +
260 d_indices.size() *
sizeof(
uint32) +
261 d_keys.size() *
sizeof(
uint32) +
262 d_active_slots.size() *
sizeof(
uint32) +
263 d_segment_flags.size() *
sizeof(
uint8) +
264 d_copy_flags.size() *
sizeof(
uint8) +
265 d_temp_flags.size() *
sizeof(
uint8);
277 thrust::device_vector<uint8> d_temp_storage;
278 thrust::device_vector<uint32> d_temp_indices;
279 thrust::device_vector<uint32> d_indices;
280 thrust::device_vector<uint32> d_keys;
281 thrust::device_vector<uint32> d_active_slots;
282 thrust::device_vector<uint8> d_segment_flags;
283 thrust::device_vector<uint8> d_copy_flags;
284 thrust::device_vector<uint8> d_temp_flags;
300 template <
typename string_type,
typename output_iterator,
typename delay_list_type>
302 const typename string_type::index_type string_len,
303 const string_type
string,
305 output_iterator d_suffixes,
306 const uint32 delay_min_threshold,
307 const uint32 delay_max_threshold,
308 delay_list_type& delay_list)
310 typedef typename string_type::index_type index_type;
327 d_suffixes + n_suffixes,
332 thrust::make_constant_iterator<uint32>(0u),
333 thrust::make_constant_iterator<uint32>(0u) + n_suffixes,
338 thrust::make_counting_iterator<uint32>(0u),
339 thrust::make_counting_iterator<uint32>(0u) + n_suffixes,
340 d_active_slots.begin() );
343 d_segment_flags[0] = 1u;
345 d_segment_flags.begin() + 1u,
346 d_segment_flags.begin() + n_suffixes,
350 uint32 n_active_suffixes = n_suffixes;
364 for (
uint32 word_idx = 0;
true; ++word_idx)
366 if ((word_idx >= delay_min_threshold && 1000 * n_active_suffixes <= n_suffixes) ||
367 (word_idx >= delay_max_threshold))
369 delay_list.push_back(
372 d_active_slots.begin() );
386 d_indices.begin() + n_active_suffixes,
411 mgpu::SegSortPairsFromFlags(
441 d_segment_flags[0] = 1u;
442 d_segment_flags[n_active_suffixes] = 1u;
450 thrust::plus<uint32>(),
457 const uint32 n_segments = d_keys[ n_active_suffixes - 1u ];
460 if (n_segments == n_active_suffixes)
466 d_indices.begin() + n_active_suffixes,
467 d_active_slots.begin(),
474 #if defined(KEY_PRUNING)
490 thrust::adjacent_difference(
491 d_segment_flags.begin(),
492 d_segment_flags.begin() + n_active_suffixes+1u,
493 d_copy_flags.begin(),
499 thrust::plus<uint32>(),
504 if (2u*n_partials <= n_active_suffixes)
513 d_indices.begin() + n_active_suffixes,
514 d_active_slots.begin(),
517 thrust::device_vector<uint32>& d_temp_indices = d_keys;
523 d_copy_flags.begin() + 1u,
524 d_temp_indices.begin(),
525 d_temp_storage ) != n_partials)
526 throw nvbio::runtime_error(
"mismatching number of partial indices %u != %u\n", n_active, n_partials);
528 d_indices.swap( d_temp_indices );
533 d_active_slots.begin(),
534 d_copy_flags.begin() + 1u,
535 d_temp_indices.begin(),
536 d_temp_storage ) != n_partials)
537 throw nvbio::runtime_error(
"mismatching number of partial slots %u != %u\n", n_active, n_partials);
539 d_active_slots.swap( d_temp_indices );
544 d_segment_flags.begin(),
545 d_copy_flags.begin() + 1u,
546 d_temp_flags.begin(),
547 d_temp_storage ) != n_partials)
548 throw nvbio::runtime_error(
"mismatching number of partial flags %u != %u\n", n_active, n_partials);
550 d_segment_flags.swap( d_temp_flags );
553 n_active_suffixes = n_partials;
561 #endif // if defined(KEY_PRUNING)
566 log_error(stderr,
"CompressionSort::sort() : cuda_error caught!\n %s\n", error.
what());
571 log_error(stderr,
"CompressionSort::sort() : exception caught!\n");
585 template <
typename set_type,
typename input_iterator,
typename output_iterator,
typename delay_list_type>
590 input_iterator d_input,
591 output_iterator d_output,
592 const uint32 delay_threshold,
593 delay_list_type& delay_list,
598 typedef uint32 index_type;
615 thrust::make_counting_iterator<uint32>(0u),
616 thrust::make_counting_iterator<uint32>(0u) + n_strings,
617 d_active_slots.begin() );
620 d_segment_flags[0] = 1u;
622 d_segment_flags.begin() + 1u,
623 d_segment_flags.begin() + n_strings,
627 uint32 n_active_strings = n_strings;
641 for (
uint32 word_block_begin = 0; word_block_begin < max_words; word_block_begin += slice_size)
644 const uint32 word_block_end =
nvbio::min( word_block_begin + slice_size, max_words );
649 #if defined(CULLED_EXTRACTION)
670 for (
uint32 word_idx = word_block_begin; word_idx < word_block_end; ++word_idx)
672 if (word_idx > delay_threshold && 1000 * n_active_strings <= n_strings)
674 delay_list.push_back(
677 d_active_slots.begin() );
682 #if defined(CULLED_EXTRACTION)
721 d_indices.begin() + n_active_strings,
722 d_temp_indices.begin(),
744 mgpu::SegSortPairsFromFlags(
774 d_segment_flags[0] = 1u;
775 d_segment_flags[n_active_strings] = 1u;
783 thrust::plus<uint32>(),
790 const uint32 n_segments = d_keys[ n_active_strings - 1u ];
793 if (n_segments == n_active_strings ||
794 word_idx+1 == max_words)
798 if (n_active_strings == n_strings)
804 d_indices.begin() + n_active_strings,
814 d_indices.begin() + n_active_strings,
815 d_active_slots.begin(),
825 if (word_block_end < max_words)
842 thrust::adjacent_difference(
843 d_segment_flags.begin(),
844 d_segment_flags.begin() + n_active_strings+1u,
845 d_copy_flags.begin(),
851 thrust::plus<uint32>(),
856 if (2u*n_partials <= n_active_strings)
865 d_indices.begin() + n_active_strings,
866 d_active_slots.begin(),
878 thrust::device_vector<uint32>& d_temp_indices = d_keys;
884 d_copy_flags.begin() + 1u,
885 d_temp_indices.begin(),
888 d_indices.swap( d_temp_indices );
893 d_active_slots.begin(),
894 d_copy_flags.begin() + 1u,
895 d_temp_indices.begin(),
898 d_active_slots.swap( d_temp_indices );
903 d_segment_flags.begin(),
904 d_copy_flags.begin() + 1u,
905 d_temp_flags.begin(),
908 d_segment_flags.swap( d_temp_flags );
911 n_active_strings = n_partials;
922 log_error(stderr,
"CompressionSort::sort() : cuda_error caught!\n %s\n", error.
what());
927 log_error(stderr,
"CompressionSort::sort() : exception caught!\n");