38 #include <thrust/device_vector.h>
39 #include <thrust/transform_scan.h>
40 #include <thrust/binary_search.h>
41 #include <thrust/iterator/constant_iterator.h>
42 #include <thrust/iterator/counting_iterator.h>
43 #include <thrust/sort.h>
44 #include <mgpuhost.cuh>
45 #include <moderngpu.cuh>
// ---------------------------------------------------------------------------
// NOTE(review): this is a partial extract of a larger CUDA/C++ source file.
// The leading integers on most lines ("58", "60", ...) are original line
// numbers embedded by the extraction, and the jumps between them show that
// many statements are missing from this view.  Do NOT compile as-is; the
// comments below describe only what the visible lines establish.
//
// Visible contract (the signature line with the function's name is missing,
// so the name is unknown — presumably a block-wise suffix-sort / BWT driver;
// confirm against the full source): a host-side template routine that takes
// a string length, a count of suffixes, an iterator over suffix indices, and
// an output_handler&, and processes the suffixes in GPU-sized batches:
//   1. query device memory and size the working buffers to fit,
//   2. count suffixes per radix bucket in chunks of M = 1M suffixes,
//   3. group buckets into host-sized "super-blocks" and device-sized
//      sub-buckets,
//   4. collect each super-block's suffixes on the host,
//   5. sort each sub-bucket on the device and emit it through `output`,
//      deferring some suffixes into `delay_list` for a later scattered pass
//      (COMPRESSION_SORTING path).
// ---------------------------------------------------------------------------
58 template <
typename string_type,
typename suffix_iterator,
typename output_handler>
// (the function name / return-type line, original line ~59, is not in view)
60 const typename string_type::index_type string_len,
62 const typename string_type::index_type n_suffixes,
63 suffix_iterator suffixes,
64 output_handler& output,
// index_type is the string's index representation (width not visible here).
68 typedef typename string_type::index_type index_type;
// M: suffixes processed per counting/collection chunk (1M).
76 const uint32 M = 1024*1024;
77 const index_type N = n_suffixes;
// Ceil-division: number of M-sized chunks covering all suffixes.
79 const uint32 n_chunks = (n_suffixes + M-1) / M;
// Query device memory; `free`/`total` are declared in lines not in view.
83 cudaMemGetInfo(&free, &total);
// Memory budget — the rest of this ?: expression is missing from the extract.
94 const uint64 max_device_memory = params ?
100 uint32 needed_device_mem = 0;
// Shrink max_block_size (32M down to 1M, halving) until the needed device
// allocation fits the budget; the full cost expression is partly missing.
103 for (max_block_size = 32*1024*1024; max_block_size >= 1024*1024; max_block_size /= 2)
105 const uint32 n_buckets = 1u << (BUCKETING_BITS);
// Part of needed_device_mem: 3 uint32 words per bucket, plus terms not in view.
110 n_buckets *
sizeof(
uint32) * 3 +
114 if (needed_device_mem <= max_device_memory)
// Threshold (in suffixes) above which the delay list is flushed (see below).
118 const uint32 DELAY_BUFFER = 1024*1024;
120 log_verbose(stderr,
"    super-block-size: %.1f M\n",
float(max_super_block_size)/
float(1024*1024));
121 log_verbose(stderr,
"          block-size: %.1f M\n",
float(max_block_size)/
float(1024*1024));
// Host staging buffers: one super-block of suffixes, plus per-block
// suffix/radix scratch sized to the device block.
122 thrust::host_vector<uint32> h_super_suffixes( max_super_block_size, 0u );
123 thrust::host_vector<uint32> h_block_suffixes( max_block_size );
124 thrust::host_vector<uint32> h_block_radices( max_block_size );
126 log_verbose(stderr,
"  device-alloc(%.1f GB)... started\n",
float(needed_device_mem)/
float(1024*1024*1024));
127 log_verbose(stderr,
"    free: %.1f GB\n",
float(free)/
float(1024*1024*1024));
// Device working set: current sub-bucket's suffixes, delayed suffixes and
// their output slots, and the block's BWT symbols (uint8 per suffix).
129 thrust::device_vector<uint32> d_subbucket_suffixes( max_block_size );
130 thrust::device_vector<uint32> d_delayed_suffixes( max_block_size );
131 thrust::device_vector<uint32> d_delayed_slots( max_block_size );
132 thrust::device_vector<uint8>  d_block_bwt( max_block_size );
// Presumably the constructor arguments of `delay_list` (declared off-view).
135 d_delayed_suffixes.begin(),
136 d_delayed_slots.begin() );
// NOTE(review): "¤t_device" is encoding damage — "&curren" of
// "&current_device" was interpreted as the HTML entity &curren; (¤).
// Restore to: cudaGetDevice( &current_device );  Also note the return code
// is unchecked here.
139 cudaGetDevice( ¤t_device );
142 #define COMPRESSION_SORTING
143 #if defined(COMPRESSION_SORTING)
// Pre-reserve the compression sorter's scratch for a full block.
145 compression_sort.
reserve( max_block_size );
150 log_verbose(stderr,
"  device-alloc(%.1f GB)... done\n",
float(needed_device_mem)/
float(1024*1024*1024));
// One counter per radix bucket, zero-initialized.
154 thrust::device_vector<uint32> d_buckets( 1u << (BUCKETING_BITS), 0u );
// Pass 1: histogram suffixes into buckets, one M-sized chunk at a time.
156 for (
uint32 chunk_id = 0; chunk_id < n_chunks; ++chunk_id)
158 const index_type chunk_begin = chunk_id * M;
159 const index_type chunk_end =
nvbio::min( chunk_begin + M, N );
160 const uint32 chunk_size =
uint32( chunk_end - chunk_begin );
// Count this chunk's suffixes per bucket (bucketer declared off-view).
163 bucketer.
count( chunk_size, suffixes + chunk_begin, string_len,
string );
// Tail of an accumulation (likely thrust::transform with plus) — the call
// head is missing from the extract.
171 thrust::plus<uint32>() );
// Bring the histogram to the host; derive per-bucket output offsets and the
// sub-bucket ownership map.
174 thrust::host_vector<uint32> h_buckets( d_buckets );
175 thrust::host_vector<uint32> h_bucket_offsets( d_buckets.size() );
176 thrust::host_vector<uint32> h_subbuckets( d_buckets.size() );
// Tail of a reduction (thrust::maximum) computing max_bucket_size.
182 thrust::maximum<uint32>() );
184 log_verbose(stderr,
"    max bucket size: %u\n", max_bucket_size);
// Tail of an exclusive/inclusive scan filling h_bucket_offsets.
196 h_bucket_offsets.begin() );
// Greedily group consecutive buckets into super-blocks whose total stays
// strictly below max_super_block_size.
199 for (
uint32 bucket_begin = 0, bucket_end = 0; bucket_begin < h_buckets.size(); bucket_begin = bucket_end)
203 for (bucket_size = 0; (bucket_end < h_buckets.size()) && (bucket_size + h_buckets[bucket_end] < max_super_block_size); ++bucket_end)
204 bucket_size += h_buckets[bucket_end];
// Within a super-block, group buckets into device-sized sub-buckets.
207 for (
uint32 subbucket_begin = bucket_begin, subbucket_end = bucket_begin; subbucket_begin < bucket_end; subbucket_begin = subbucket_end)
// A single bucket larger than a device block cannot be split here: abort.
210 if (h_buckets[subbucket_begin] > max_block_size)
211 throw nvbio::runtime_error(
"bucket %u contains %u strings: buffer overflow!", subbucket_begin, h_buckets[subbucket_begin]);
215 for (subbucket_size = 0; (subbucket_end < bucket_end) && (subbucket_size + h_buckets[subbucket_end] < max_block_size); ++subbucket_end)
217 subbucket_size += h_buckets[subbucket_end];
// Record, per bucket, which sub-bucket (its first bucket) owns it.
219 h_subbuckets[ subbucket_end ] = subbucket_begin;
225 thrust::device_vector<uint32> d_subbuckets( h_subbuckets );
// Running count of suffixes already emitted in previous super-blocks.
232 index_type global_suffix_offset = 0;
// Pass 2: for each super-block, collect its suffixes then sort/emit them.
234 for (
uint32 bucket_begin = 0, bucket_end = 0; bucket_begin < h_buckets.size(); bucket_begin = bucket_end)
// Recompute the same super-block grouping as above.
238 for (bucket_size = 0; (bucket_end < h_buckets.size()) && (bucket_size + h_buckets[bucket_end] < max_super_block_size); ++bucket_end)
239 bucket_size += h_buckets[bucket_end];
243 log_verbose(stderr,
"  collect buckets[%u:%u] (%u suffixes)\n", bucket_begin, bucket_end, bucket_size);
245 collect_timer.
start();
// Collect this super-block's suffixes chunk by chunk (collector call is
// only partially in view).
247 for (
uint32 chunk_id = 0; chunk_id < n_chunks; ++chunk_id)
249 const index_type chunk_begin = chunk_id * M;
250 const index_type chunk_end =
nvbio::min( chunk_begin + M, N );
251 const uint32 chunk_size =
uint32( chunk_end - chunk_begin );
// Arguments of the (off-view) collect call: input suffixes, sub-bucket map,
// and host-side outputs for radices and suffix positions.
256 suffixes + chunk_begin,
261 d_subbuckets.begin(),
262 h_block_radices.begin(),
263 h_block_suffixes.begin() );
// Scatter the collected suffixes into the super-block staging buffer;
// h_bucket_offsets[bucket]++ hands out the next global slot for its bucket.
266 for (
uint32 i = 0; i < n_collected; ++i)
268 const uint32 loc    = h_block_suffixes[i];
269 const uint32 bucket = h_block_radices[i];
270 const uint64 slot   = h_bucket_offsets[bucket]++;
// Bounds assertion (macro head missing): slot must land in this super-block.
273 slot >= global_suffix_offset,
274 slot <  global_suffix_offset + max_super_block_size,
275 "[%u] = %u placed at %llu - %llu (%u)\n", i, loc, slot, global_suffix_offset, bucket );
277 h_super_suffixes[ slot - global_suffix_offset ] = loc;
280 suffix_count += n_collected;
282 if (suffix_count > max_super_block_size)
// NOTE(review): the condition above compares against max_super_block_size,
// but the message prints max_block_size — the second %u argument looks
// wrong; should presumably be max_super_block_size.  Also verify that
// suffix_count's type matches %u.
284 log_error(stderr,
"buffer size exceeded! (%u/%u)\n", suffix_count, max_block_size);
289 collect_timer.
stop();
290 collect_time += collect_timer.
seconds();
292 log_verbose(stderr,
"  collect : %.1fs, %.1f M suffixes/s\n", collect_time, 1.0e-6f*
float(global_suffix_offset + suffix_count)/collect_time);
// Packing constant for the radix words (symbols per machine word).
304 NVBIO_VAR_UNUSED const uint32 SYMBOLS_PER_WORD = priv::symbols_per_word<SYMBOL_SIZE, WORD_BITS,DOLLAR_BITS>();
// Sort each sub-bucket of this super-block on the device.
308 for (
uint32 subbucket_begin = bucket_begin, subbucket_end = bucket_begin; subbucket_begin < bucket_end; subbucket_begin = subbucket_end)
// Same sub-bucket grouping as in the planning pass above.
312 for (subbucket_size = 0; (subbucket_end < bucket_end) && (subbucket_size + h_buckets[subbucket_end] < max_block_size); ++subbucket_end)
313 subbucket_size += h_buckets[subbucket_end];
315 log_verbose(stderr,
"\r  sufsort buckets[%u:%u] (%u suffixes, %.1f M suffixes/s)        ", subbucket_begin, subbucket_end, subbucket_size, 1.0e-6f*
float(global_suffix_offset + suffix_count)/sufsort_time );
316 log_debug(stderr,
"  compression-sort\n");
// Shadows the function parameter n_suffixes within this scope (intentional?
// — flag for the full source).
319 const uint32 n_suffixes = subbucket_size;
// Upload this sub-bucket's suffixes (copy head missing from the extract).
326 h_super_suffixes.begin() + suffix_count,
327 h_super_suffixes.begin() + suffix_count + n_suffixes,
328 d_subbucket_suffixes.begin() );
330 #if defined(COMPRESSION_SORTING)
// Delayed suffixes record absolute output slots; base them here.
331 delay_list.set_offset( global_suffix_offset + suffix_count );
336 compression_sort.
sort(
340 d_subbucket_suffixes.begin(),
344 #else // if defined(COMPRESSION_SORTING)
// Fallback path (likely a plain device sort over the same range).
349 d_subbucket_suffixes.begin(),
350 d_subbucket_suffixes.begin() + n_suffixes,
360 d_subbucket_suffixes.begin(),
361 d_subbucket_suffixes.begin() + n_suffixes,
367 sufsort_time += timer.
seconds();
// Hand the sorted batch to the caller-provided output handler.
370 output.process_batch(
374 #if defined(COMPRESSION_SORTING)
// Flush the delay list when it has grown past DELAY_BUFFER, or at the end
// of the super-block.
376 if (delay_list.count &&
377     (delay_list.count >= DELAY_BUFFER ||
378      subbucket_end == bucket_end))
380 log_debug(stderr,
"    sort delayed-suffixes (%u)\n", delay_list.count);
388 delay_list.indices + delay_list.count,
406 compression_sort.
sort(
418 sufsort_time += timer.
seconds();
// Delayed suffixes go out with explicit slots (scattered write-back).
423 output.process_scattered(
433 suffix_count += subbucket_size;
436 log_verbose(stderr,
"\r  sufsort : %.1fs (%.1f M suffixes/s)                     \n", sufsort_time, 1.0e-6f*
float(global_suffix_offset + suffix_count)/sufsort_time);
438 #if defined(COMPRESSION_SORTING)
444 log_verbose(stderr,
"  bwt-copy : %.1fs\n", bwt_copy_time);
445 log_verbose(stderr,
"  bwt-scat : %.1fs\n", bwt_scatter_time);
// Advance the global cursor past this super-block's suffixes.
448 global_suffix_offset += suffix_count;
// ---------------------------------------------------------------------------
// NOTE(review): partial extract — the embedded original line numbers jump
// (454, 457, 461, ...), so statements are missing, and the definition runs
// past the end of the visible chunk.  Comments describe only what is shown.
//
// Visible contract (name and full signature not in view): a template routine
// over string_type that builds a device-side sample of string positions
// (`d_sample`).  The "allocating DCS" log and the `in_dcs` predicate suggest
// this is a difference-cover sample — confirm against the full source.
// Two passes over the string in 16M blocks: (1) count the sampled positions
// to size the buffer, (2) emit the sampled indices, then verify the estimate
// was not exceeded.
// ---------------------------------------------------------------------------
454 template <
typename string_type>
457 const typename string_type::index_type string_len,
461 typedef typename string_type::index_type index_type;
// Process the string in blocks of 16M symbols.
463 const index_type block_size = 16*1024*1024;
// Scratch for the (off-view) CUB/Thrust reduction temporary storage.
472 thrust::device_vector<uint8> d_temp_storage;
474 index_type estimated_sample_size = 0u;
// Pass 1: accumulate the number of positions selected by `in_dcs`
// (the reduction call is only partially in view).
476 for (index_type block_begin = 0; block_begin < string_len; block_begin += block_size)
479 block_begin + block_size,
485 uint32( block_end - block_begin ),
// Position indices [block_begin, block_end) filtered by the in_dcs predicate.
488 thrust::make_counting_iterator<uint32>(0u) + block_begin, in_dcs ),
490 thrust::plus<uint32>(),
493 log_verbose(stderr,
"  allocating DCS: %.1f MB\n",
float(
// 8 bytes per sample entry, presumably index + rank — confirm off-view.
size_t( estimated_sample_size )*8u)/
float(1024*1024));
495 thrust::device_vector<uint32> d_sample( estimated_sample_size );
496 index_type sample_size = 0;
// NOTE(review): the line above is byte-identical to the extract; if it reads
// "497" in the full source this comment is stale — ignore.
497 index_type sample_size = 0;