42 #include <thrust/device_vector.h>
43 #include <thrust/transform_scan.h>
44 #include <thrust/binary_search.h>
45 #include <thrust/iterator/constant_iterator.h>
46 #include <thrust/iterator/counting_iterator.h>
47 #include <thrust/sort.h>
48 #include <mgpuhost.cuh>
49 #include <moderngpu.cuh>
58 template <
typename string_type>
60 const typename string_type::index_type string_len,
61 const string_type
string)
67 return thrust::transform_reduce(
68 thrust::make_counting_iterator<uint32>(1u),
69 thrust::make_counting_iterator<uint32>(0u) + string_len,
74 thrust::plus<uint32>() ) + 1u;
79 template <
typename string_set_type,
typename output_handler>
81 const string_set_type& string_set,
82 output_handler& output,
90 NVBIO_VAR_UNUSED const uint32 SYMBOLS_PER_WORD = priv::symbols_per_word<SYMBOL_SIZE,WORD_BITS,DOLLAR_BITS>();
93 cudaGetDevice( ¤t_device );
98 suffixes.
set( string_set );
101 const uint32 m = (suffixes.
max_length( string_set ) + SYMBOLS_PER_WORD-1) / SYMBOLS_PER_WORD;
106 thrust::device_vector<word_type> radices( n_suffixes*2 );
107 thrust::device_vector<uint32> indices( n_suffixes*2 );
111 thrust::make_counting_iterator<uint32>(0u),
112 thrust::make_counting_iterator<uint32>(n_suffixes),
118 sort_buffers.selector = 0;
125 for (
int32 word_idx = m-1; word_idx >= 0; --word_idx)
132 indices.begin() + sort_buffers.selector * n_suffixes,
133 radices.begin() + sort_buffers.selector * n_suffixes );
136 sort_enactor.
sort( n_suffixes, sort_buffers );
148 template <
typename string_type,
typename output_iterator>
151 const string_type
string,
152 output_iterator output,
162 output[0] = string_len;
174 template <
typename string_type,
typename output_handler>
176 const typename string_type::index_type string_len,
178 output_handler& output,
181 typedef typename string_type::index_type index_type;
184 const size_t needed_bytes_64 = size_t( DCS::estimated_sample_size<64>( string_len ) ) * 8u;
185 const size_t needed_bytes_128 = size_t( DCS::estimated_sample_size<128>( string_len ) ) * 8u;
186 const size_t needed_bytes_256 = size_t( DCS::estimated_sample_size<256>( string_len ) ) * 8u;
187 const size_t needed_bytes_512 = size_t( DCS::estimated_sample_size<512>( string_len ) ) * 8u;
188 const size_t needed_bytes_1024 = size_t( DCS::estimated_sample_size<1024>( string_len ) ) * 8u;
191 cudaMemGetInfo(&free, &total);
195 if (free >= 2*needed_bytes_64)
197 else if (free >= 2*needed_bytes_128)
199 else if (free >= 2*needed_bytes_256)
201 else if (free >= 2*needed_bytes_512)
203 else if (free >= 2*needed_bytes_1024)
209 log_verbose(stderr,
" building DCS-%u... started\n", dcs.
Q);
217 log_verbose(stderr,
" building DCS-%u... done\n", dcs.
Q);
220 log_verbose(stderr,
" DCS-based sorting... started\n");
226 thrust::make_counting_iterator<uint32>(0u),
231 log_verbose(stderr,
" DCS-based sorting... done\n");
238 template <
typename string_type,
typename output_iterator>
239 typename string_type::index_type
bwt(
240 const typename string_type::index_type string_len,
242 output_iterator output,
245 typedef typename string_type::index_type index_type;
268 template <u
int32 BUCKETING_BITS_T, u
int32 SYMBOL_SIZE,
bool BIG_ENDIAN,
typename storage_type>
271 typedef typename std::iterator_traits<storage_type>::value_type
word_type;
285 SYMBOL_SIZE,BIG_ENDIAN,storage_type,
290 template <u
int32 BUCKETING_BITS_T, u
int32 SYMBOL_SIZE,
bool BIG_ENDIAN,
typename storage_type>
293 typedef typename std::iterator_traits<storage_type>::value_type
word_type;
307 SYMBOL_SIZE,BIG_ENDIAN,storage_type,
311 template <u
int32 BUCKETING_BITS_T, u
int32 SYMBOL_SIZE,
bool BIG_ENDIAN,
typename storage_type>
314 typedef typename std::iterator_traits<storage_type>::value_type
word_type;
328 SYMBOL_SIZE,BIG_ENDIAN,storage_type,
345 operator bool()
const {
return code ==
OK ?
true :
false; }
352 template <
typename ConfigType, u
int32 SYMBOL_SIZE,
bool BIG_ENDIAN,
typename storage_type>
355 typedef typename std::iterator_traits<storage_type>::value_type
word_type;
367 const thrust::host_vector<uint32>& h_buckets,
368 const uint64 max_super_block_size,
379 for (
uint32 bucket_begin = 0, bucket_end = 0; bucket_begin < h_buckets.size(); bucket_begin = bucket_end)
383 for (bucket_size = 0; (bucket_end < h_buckets.size()) && (bucket_size + h_buckets[bucket_end] <= max_super_block_size); ++bucket_end)
384 bucket_size += h_buckets[bucket_end];
389 if (bucket_end == bucket_begin)
390 throw nvbio::runtime_error(
"bucket %u contains %u strings: buffer overflow!", bucket_begin, h_buckets[bucket_begin]);
393 for (
uint32 subbucket = bucket_begin; subbucket < bucket_end; ++subbucket)
396 if ((subbucket & DOLLAR_MASK) == DOLLAR_MASK)
398 if (max_size < h_buckets[subbucket])
400 max_size = h_buckets[subbucket];
401 max_index = subbucket;
407 if (max_size > limit)
420 const thrust::host_vector<uint32>& h_buckets,
421 thrust::host_vector<uint32>& h_subbuckets,
422 const uint64 max_super_block_size,
423 const uint32 max_block_size)
429 for (
uint32 bucket_begin = 0, bucket_end = 0; bucket_begin < h_buckets.size(); bucket_begin = bucket_end)
433 for (bucket_size = 0; (bucket_end < h_buckets.size()) && (bucket_size + h_buckets[bucket_end] <= max_super_block_size); ++bucket_end)
434 bucket_size += h_buckets[bucket_end];
439 if (bucket_end == bucket_begin)
440 throw nvbio::runtime_error(
"bucket %u contains %u strings: buffer overflow!", bucket_begin, h_buckets[bucket_begin]);
443 for (
uint32 subbucket_begin = bucket_begin, subbucket_end = bucket_begin; subbucket_begin < bucket_end; subbucket_begin = subbucket_end)
445 if (h_buckets[subbucket_begin] > max_block_size)
448 if ((subbucket_begin & DOLLAR_MASK) == DOLLAR_MASK)
449 throw nvbio::runtime_error(
"bucket %u contains %u strings: buffer overflow!", subbucket_begin, h_buckets[subbucket_begin]);
452 h_subbuckets[ subbucket_end++ ] = subbucket_begin;
458 for (subbucket_size = 0; (subbucket_end < bucket_end) && (subbucket_size + h_buckets[subbucket_end] <= max_block_size); ++subbucket_end)
460 subbucket_size += h_buckets[subbucket_end];
462 h_subbuckets[ subbucket_end ] = subbucket_begin;
469 template <
typename output_handler>
472 output_handler& output,
486 cudaGetDevice( ¤t_device );
495 const uint64 max_super_block_size = params ?
498 uint32 max_block_size = params ?
502 log_verbose(stderr,
" super-block-size: %.1f M\n",
float(max_super_block_size)/
float(1024*1024));
503 log_verbose(stderr,
" block-size: %.1f M\n",
float(max_block_size)/
float(1024*1024));
504 thrust::host_vector<uint2> h_suffixes( max_super_block_size );
505 thrust::host_vector<uint8> h_block_bwt;
508 thrust::device_vector<uint32> d_indices;
509 thrust::device_vector<uint2> d_bucket_suffixes;
510 thrust::device_vector<uint8> d_block_bwt;
517 const uint32 n_buckets = 1u << BUCKETING_BITS;
519 thrust::host_vector<uint32> h_buckets( n_buckets );
520 thrust::host_vector<uint32> h_subbuckets( n_buckets );
523 const uint64 total_suffixes = bucketer.count( string_set, h_buckets );
530 thrust::maximum<uint32>() );
536 log_verbose(stderr,
" exceeded maximum bucket size\n");
540 log_verbose(stderr,
" max bucket size: %u (%u)\n", largest_subbucket, max_bucket_size);
541 bucketer.log_count_stats();
543 float bwt_time = 0.0f;
544 float output_time = 0.0f;
547 const uint32 block_size = max_block_size / 4u;
548 for (
uint32 block_begin = 0; block_begin < N; block_begin += block_size)
553 const uint32 n_suffixes = block_end - block_begin;
562 string_set_handler.dollar_bwt(
570 h_block_bwt.begin() + n_suffixes,
571 d_block_bwt.begin() );
596 output_time += timer.
seconds();
599 bucketer.clear_timers();
606 float sufsort_time = 0.0f;
607 float collect_time = 0.0f;
610 const uint32 optimal_block_size = 32*1024*1024;
611 if (largest_subbucket <= optimal_block_size)
612 max_block_size = optimal_block_size;
618 string_set_handler.reserve( max_block_size, SLICE_SIZE );
619 string_sorter.
reserve( max_block_size );
626 log_verbose(stderr,
" allocated device memory: %.1f MB\n",
627 float( bucketer.allocated_device_memory() +
628 string_set_handler.allocated_device_memory() +
630 d_block_bwt.size() *
sizeof(
uint8) +
631 d_indices.size() *
sizeof(
uint32) +
632 d_bucket_suffixes.size() *
sizeof(uint2)
633 ) / float(1024*1024) );
634 log_verbose(stderr,
" bucketer : %.1f MB\n",
float( bucketer.allocated_device_memory() ) /
float(1024*1024) );
635 log_verbose(stderr,
" handler : %.1f MB\n",
float( string_set_handler.allocated_device_memory() ) /
float(1024*1024) );
638 log_verbose(stderr,
" allocated host memory: %.1f MB\n",
639 float( bucketer.allocated_host_memory() +
640 string_set_handler.allocated_host_memory() +
641 h_block_bwt.size() *
sizeof(
uint8) +
642 h_suffixes.size() *
sizeof(uint2) +
643 h_buckets.size() *
sizeof(
uint32) +
644 h_subbuckets.size() *
sizeof(
uint32)
645 ) / float(1024*1024) );
652 max_super_block_size,
655 uint64 global_suffix_offset = 0;
657 for (
uint32 bucket_begin = 0, bucket_end = 0; bucket_begin < h_buckets.size(); bucket_begin = bucket_end)
661 for (bucket_size = 0; (bucket_end < h_buckets.size()) && (bucket_size + h_buckets[bucket_end] <= max_super_block_size); ++bucket_end)
662 bucket_size += h_buckets[bucket_end];
669 uint32 max_suffix_len = 0;
671 log_verbose(stderr,
" collect buckets[%u:%u] (%llu suffixes)\n", bucket_begin, bucket_end, bucket_size);
673 collect_timer.
start();
675 suffix_count = bucketer.collect(
683 collect_timer.
stop();
684 collect_time += collect_timer.
seconds();
685 log_verbose(stderr,
" collect : %.1fs (%.1f M suffixes/s - %.1f M scans/s)\n", collect_time, 1.0e-6f*
float(global_suffix_offset + suffix_count)/collect_time, 1.0e-6f*
float(total_suffixes)/collect_time);
686 bucketer.log_collect_stats();
695 const uint32 n_words = string_set_handler.num_words( max_suffix_len );
697 for (
uint32 subbucket_begin = bucket_begin, subbucket_end = bucket_begin; subbucket_begin < bucket_end; subbucket_begin = subbucket_end)
699 if (h_buckets[subbucket_begin] > max_block_size)
702 if ((subbucket_begin & DOLLAR_MASK) == DOLLAR_MASK)
703 throw nvbio::runtime_error(
"bucket %u contains %u strings: overflow!", subbucket_begin, h_buckets[subbucket_begin]);
708 const uint32 subbucket_size = h_buckets[subbucket_begin];
714 for (
uint32 block_begin = 0; block_begin < subbucket_size; block_begin += max_block_size)
716 const uint32 block_end =
nvbio::min( block_begin + max_block_size, subbucket_size );
719 const uint32 n_suffixes = block_end - block_begin;
722 const uint2* h_bucket_suffixes = &h_suffixes[0] + suffix_count + block_begin;
728 h_bucket_suffixes + n_suffixes,
729 d_bucket_suffixes.begin() );
732 string_set_handler.init( n_suffixes, h_bucket_suffixes,
nvbio::plain_view( d_bucket_suffixes ) );
741 string_set_handler.bwt(
770 output_time += timer.
seconds();
773 suffix_count += subbucket_size;
776 sufsort_time += suf_timer.
seconds();
782 for (subbucket_size = 0; (subbucket_end < bucket_end) && (subbucket_size + h_buckets[subbucket_end] <= max_block_size); ++subbucket_end)
783 subbucket_size += h_buckets[subbucket_end];
785 log_verbose(stderr,
"\r sufsort buckets[%u:%u] (%.1f M suffixes/s) ", subbucket_begin, subbucket_end, 1.0e-6f*
float(global_suffix_offset + suffix_count)/sufsort_time);
786 if (subbucket_size == 0)
790 const uint32 n_suffixes = subbucket_size;
799 log_error(stderr,
"LargeBWTSkeleton: d_indices allocation failed!\n");
807 const uint2* h_bucket_suffixes = &h_suffixes[0] + suffix_count;
814 h_bucket_suffixes + n_suffixes,
815 d_bucket_suffixes.begin() );
820 string_set_handler.init( n_suffixes, h_bucket_suffixes,
nvbio::plain_view( d_bucket_suffixes ) );
830 thrust::make_counting_iterator<uint32>(0u),
845 string_set_handler.bwt(
876 output_time += timer.
seconds();
878 suffix_count += subbucket_size;
881 sufsort_time += suf_timer.
seconds();
884 log_verbose(stderr,
"\r sufsort : %.1fs (%.1f M suffixes/s) \n", sufsort_time, 1.0e-6f*
float(global_suffix_offset + suffix_count)/sufsort_time);
892 log_verbose(stderr,
" output : %.1fs\n", output_time);
894 global_suffix_offset += suffix_count;
902 template <u
int32 SYMBOL_SIZE,
bool BIG_ENDIAN,
typename storage_type,
typename output_handler>
907 output_handler& output,
945 template <u
int32 SYMBOL_SIZE,
bool BIG_ENDIAN,
typename storage_type,
typename output_handler>
950 output_handler& output,
965 if (bucketing_bits <= 16u)
975 if (bucketing_bits <= 20u)
985 if (bucketing_bits <= 24u)
1009 if (bucketing_bits <= 16u)
1019 if (bucketing_bits <= 20u)
1029 if (bucketing_bits <= 24u)