blockwise_sufsort.h
/*
 * nvbio
 * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *    * Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *    * Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *    * Neither the name of the NVIDIA CORPORATION nor the
 *      names of its contributors may be used to endorse or promote products
 *      derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#pragma once

#include <nvbio/sufsort/sufsort_priv.h>
#include <nvbio/sufsort/sufsort_utils.h>
#include <nvbio/sufsort/compression_sort.h>
#include <nvbio/sufsort/dcs.h>
#include <nvbio/basic/thrust_view.h>
#include <nvbio/basic/console.h>
#include <nvbio/basic/cuda/sort.h>
#include <nvbio/basic/timer.h>
#include <thrust/device_vector.h>
#include <thrust/transform_scan.h>
#include <thrust/binary_search.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/sort.h>
#include <mgpuhost.cuh>
#include <moderngpu.cuh>


namespace nvbio {
namespace cuda {

/// Sort a batch of suffixes of a given string, delivering the sorted blocks
/// (and the delayed, scattered suffixes) to a user-supplied output handler
///
template <typename string_type, typename suffix_iterator, typename output_handler>
void blockwise_suffix_sort(
    const typename string_type::index_type  string_len,
    string_type                             string,
    const typename string_type::index_type  n_suffixes,
    suffix_iterator                         suffixes,
    output_handler&                         output,
    const DCS*                              dcs,
    BWTParams*                              params)
{
    typedef typename string_type::index_type index_type;
    typedef uint32 word_type;

    NVBIO_VAR_UNUSED const uint32 SYMBOL_SIZE    = string_type::SYMBOL_SIZE;
    NVBIO_VAR_UNUSED const uint32 BUCKETING_BITS = 20;
    NVBIO_VAR_UNUSED const uint32 DOLLAR_BITS    = 4;
    NVBIO_VAR_UNUSED const uint32 WORD_BITS      = uint32( 8u * sizeof(uint32) );

    const uint32     M = 1024*1024;    // requires M * 16 device memory bytes
    const index_type N = n_suffixes;

    const uint32 n_chunks = (n_suffixes + M-1) / M;

    // check the amount of available device memory
    size_t free, total;
    cudaMemGetInfo(&free, &total);

    //const size_t max_super_block_mem  = free - max_block_size*16u - 512u*1024u*1024u;
    //const uint32 max_super_block_size = uint32( max_super_block_mem / 4u );
    const uint32 max_super_block_size = nvbio::min(           // requires max_super_block_size*4 host memory bytes
        index_type( params ?
            (params->host_memory - (128u*1024u*1024u)) / 4u : // leave 128MB for the bucket counters
            512*1024*1024 ),
        string_len );
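    // note: each suffix collected into the current super-block is stored as a
    // 4-byte index in host memory, hence the host_memory/4 budget above; the
    // 512M-entry default amounts to 2GB of host storage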

    // determine the amount of available device memory
    const uint64 max_device_memory = params ?
        nvbio::min( uint64( free ), uint64( params->device_memory ) ) :
        uint64( free );

    // determine the maximum block size
    uint32 max_block_size    = 0;
    uint32 needed_device_mem = 0;

    // start from a work-optimal block size and progressively halve it until we fit our memory budget
    for (max_block_size = 32*1024*1024; max_block_size >= 1024*1024; max_block_size /= 2)
    {
        const uint32 n_buckets = 1u << (BUCKETING_BITS);

        // compute the amount of needed device memory
        needed_device_mem = CompressionSort::needed_device_memory( max_block_size ) + // CompressionSort requirements
                            max_block_size*(sizeof(uint32)*3+sizeof(uint8)) +         // d_subbucket_suffixes + d_delayed_suffixes + d_delayed_slots + d_block_bwt
                            n_buckets * sizeof(uint32) * 3 +                           // d_buckets + d_subbuckets + bucketer.d_buckets
                            128*1024*1024;                                             // scraps

        // check whether we fit in our budget
        if (needed_device_mem <= max_device_memory)
            break;
    }
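    // for reference: at the initial 32M block size the per-block buffers alone
    // take 32M * 13B = 416MB (three uint32 vectors plus one uint8 vector), and
    // the 2^20 bucket counters a further 12MB, on top of the CompressionSort
    // scratch space and the 128MB of slack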

    const uint32 DELAY_BUFFER = 1024*1024;

    log_verbose(stderr," super-block-size: %.1f M\n", float(max_super_block_size)/float(1024*1024));
    log_verbose(stderr," block-size: %.1f M\n", float(max_block_size)/float(1024*1024));
    thrust::host_vector<uint32> h_super_suffixes( max_super_block_size, 0u );
    thrust::host_vector<uint32> h_block_suffixes( max_block_size );
    thrust::host_vector<uint32> h_block_radices( max_block_size );

    log_verbose(stderr," device-alloc(%.1f GB)... started\n", float(needed_device_mem)/float(1024*1024*1024));
    log_verbose(stderr," free: %.1f GB\n", float(free)/float(1024*1024*1024));

    thrust::device_vector<uint32> d_subbucket_suffixes( max_block_size );
    thrust::device_vector<uint32> d_delayed_suffixes( max_block_size );
    thrust::device_vector<uint32> d_delayed_slots( max_block_size );
    thrust::device_vector<uint8>  d_block_bwt( max_block_size );

    DelayList<thrust::device_vector<uint32>::iterator> delay_list(
        d_delayed_suffixes.begin(),
        d_delayed_slots.begin() );

    int current_device;
    cudaGetDevice( &current_device );
    mgpu::ContextPtr mgpu_ctxt = mgpu::CreateCudaDevice( current_device );

    #define COMPRESSION_SORTING
    #if defined(COMPRESSION_SORTING)
    CompressionSort compression_sort( mgpu_ctxt );
    compression_sort.reserve( max_block_size );
    #endif

    priv::StringSuffixBucketer<SYMBOL_SIZE,BUCKETING_BITS,DOLLAR_BITS> bucketer;

    log_verbose(stderr," device-alloc(%.1f GB)... done\n", float(needed_device_mem)/float(1024*1024*1024));
    log_verbose(stderr," bucket counting\n");

    // global bucket sizes
    thrust::device_vector<uint32> d_buckets( 1u << (BUCKETING_BITS), 0u );

    for (uint32 chunk_id = 0; chunk_id < n_chunks; ++chunk_id)
    {
        const index_type chunk_begin = chunk_id * M;
        const index_type chunk_end   = nvbio::min( chunk_begin + M, N );
        const uint32     chunk_size  = uint32( chunk_end - chunk_begin );

        // count the chunk's buckets
        bucketer.count( chunk_size, suffixes + chunk_begin, string_len, string );

        // and merge them in with the global buckets
        thrust::transform(
            bucketer.d_buckets.begin(),
            bucketer.d_buckets.end(),
            d_buckets.begin(),
            d_buckets.begin(),
            thrust::plus<uint32>() );
    }
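    // d_buckets[i] now counts, over the whole string, the suffixes whose
    // leading BUCKETING_BITS-bit radix equals i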

    thrust::host_vector<uint32> h_buckets( d_buckets );
    thrust::host_vector<uint32> h_bucket_offsets( d_buckets.size() );
    thrust::host_vector<uint32> h_subbuckets( d_buckets.size() );

    NVBIO_VAR_UNUSED const uint32 max_bucket_size = thrust::reduce(
        d_buckets.begin(),
        d_buckets.end(),
        0u,
        thrust::maximum<uint32>() );

    log_verbose(stderr," max bucket size: %u\n", max_bucket_size);
    log_verbose(stderr," c-sort : %.1fs\n", bucketer.d_count_sort_time);

    //
    // at this point, we have to do multiple passes through the input string set,
    // collecting in each pass as many buckets as we can fit in memory at once
    //

    // scan the bucket sizes so as to obtain global output positions
    thrust::exclusive_scan(
        h_buckets.begin(),
        h_buckets.end(),
        h_bucket_offsets.begin() );
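    // h_bucket_offsets[i] is now the global output position of the first suffix
    // of bucket i; the collection loop below consumes these slots with a
    // post-increment, one suffix at a time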

    // build the subbucket pointers
    for (uint32 bucket_begin = 0, bucket_end = 0; bucket_begin < h_buckets.size(); bucket_begin = bucket_end)
    {
        // grow the block of buckets as long as it fits in a super-block
        uint32 bucket_size;
        for (bucket_size = 0; (bucket_end < h_buckets.size()) && (bucket_size + h_buckets[bucket_end] < max_super_block_size); ++bucket_end)
            bucket_size += h_buckets[bucket_end];

        // build the sub-buckets
        for (uint32 subbucket_begin = bucket_begin, subbucket_end = bucket_begin; subbucket_begin < bucket_end; subbucket_begin = subbucket_end)
        {
            // check if this bucket is too large
            if (h_buckets[subbucket_begin] > max_block_size)
                throw nvbio::runtime_error("bucket %u contains %u suffixes: buffer overflow!", subbucket_begin, h_buckets[subbucket_begin]);

            // grow the block of sub-buckets as long as it fits in a block
            uint32 subbucket_size;
            for (subbucket_size = 0; (subbucket_end < bucket_end) && (subbucket_size + h_buckets[subbucket_end] < max_block_size); ++subbucket_end)
            {
                subbucket_size += h_buckets[subbucket_end];

                h_subbuckets[ subbucket_end ] = subbucket_begin; // point to the beginning of this sub-bucket
            }
        }
    }
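    // h_subbuckets[i] now points to the first bucket of the block of sub-buckets
    // containing bucket i, so that each collected suffix can be located relative
    // to the beginning of its own block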

    // build the subbucket pointers
    thrust::device_vector<uint32> d_subbuckets( h_subbuckets );

    NVBIO_VAR_UNUSED float sufsort_time     = 0.0f;
    NVBIO_VAR_UNUSED float collect_time     = 0.0f;
    NVBIO_VAR_UNUSED float bwt_copy_time    = 0.0f;
    NVBIO_VAR_UNUSED float bwt_scatter_time = 0.0f;

    index_type global_suffix_offset = 0;

    for (uint32 bucket_begin = 0, bucket_end = 0; bucket_begin < h_buckets.size(); bucket_begin = bucket_end)
    {
        // grow the block of buckets as long as it fits in a super-block
        uint32 bucket_size;
        for (bucket_size = 0; (bucket_end < h_buckets.size()) && (bucket_size + h_buckets[bucket_end] < max_super_block_size); ++bucket_end)
            bucket_size += h_buckets[bucket_end];

        uint32 suffix_count = 0;

        log_verbose(stderr," collect buckets[%u:%u] (%u suffixes)\n", bucket_begin, bucket_end, bucket_size);
        Timer collect_timer;
        collect_timer.start();

        for (uint32 chunk_id = 0; chunk_id < n_chunks; ++chunk_id)
        {
            const index_type chunk_begin = chunk_id * M;
            const index_type chunk_end   = nvbio::min( chunk_begin + M, N );
            const uint32     chunk_size  = uint32( chunk_end - chunk_begin );

            // collect the chunk's suffixes within the bucket range
            const uint32 n_collected = bucketer.collect(
                chunk_size,
                suffixes + chunk_begin,
                string_len,
                string,
                bucket_begin,
                bucket_end,
                d_subbuckets.begin(),
                h_block_radices.begin(),
                h_block_suffixes.begin() );

            // dispatch each suffix to its respective bucket
            for (uint32 i = 0; i < n_collected; ++i)
            {
                const uint32 loc    = h_block_suffixes[i];
                const uint32 bucket = h_block_radices[i];
                const uint64 slot   = h_bucket_offsets[bucket]++; // this could be done in parallel using atomics

                NVBIO_CUDA_DEBUG_ASSERT(
                    slot >= global_suffix_offset &&
                    slot <  global_suffix_offset + max_super_block_size,
                    "[%u] = %u placed at %llu - %llu (%u)\n", i, loc, slot, global_suffix_offset, bucket );

                h_super_suffixes[ slot - global_suffix_offset ] = loc;
            }

            suffix_count += n_collected;

            if (suffix_count > max_super_block_size)
            {
                log_error(stderr,"buffer size exceeded! (%u/%u)\n", suffix_count, max_super_block_size);
                exit(1);
            }
        }
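        // h_super_suffixes now holds all the suffixes of buckets
        // [bucket_begin, bucket_end), laid out contiguously in bucket order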
        NVBIO_CUDA_DEBUG_STATEMENT( cudaDeviceSynchronize() );
        collect_timer.stop();
        collect_time += collect_timer.seconds();

        log_verbose(stderr," collect : %.1fs, %.1f M suffixes/s\n", collect_time, 1.0e-6f*float(global_suffix_offset + suffix_count)/collect_time);
        log_verbose(stderr," setup : %.1fs\n", bucketer.d_setup_time);
        log_verbose(stderr," flatten : %.1fs\n", bucketer.d_flatten_time);
        log_verbose(stderr," b-sort : %.1fs\n", bucketer.d_collect_sort_time);
        log_verbose(stderr," search : %.1fs\n", bucketer.d_remap_time);
        log_verbose(stderr," copy : %.1fs\n", bucketer.d_copy_time);

        //
        // at this point we have a large collection of localized suffixes to sort in d_subbucket_suffixes;
        // we'll do it looping on multiple sub-buckets, on the GPU
        //

        NVBIO_VAR_UNUSED const uint32 SYMBOLS_PER_WORD = priv::symbols_per_word<SYMBOL_SIZE,WORD_BITS,DOLLAR_BITS>();

        suffix_count = 0u;

        for (uint32 subbucket_begin = bucket_begin, subbucket_end = bucket_begin; subbucket_begin < bucket_end; subbucket_begin = subbucket_end)
        {
            // grow the block of sub-buckets as long as it fits in a block
            uint32 subbucket_size;
            for (subbucket_size = 0; (subbucket_end < bucket_end) && (subbucket_size + h_buckets[subbucket_end] < max_block_size); ++subbucket_end)
                subbucket_size += h_buckets[subbucket_end];

            log_verbose(stderr,"\r sufsort buckets[%u:%u] (%u suffixes, %.1f M suffixes/s) ", subbucket_begin, subbucket_end, subbucket_size, 1.0e-6f*float(global_suffix_offset + suffix_count)/sufsort_time );
            log_debug(stderr, " compression-sort\n");

            // consume subbucket_size suffixes
            const uint32 n_suffixes = subbucket_size;

            Timer timer;
            timer.start();

            // initialize the device sorting indices
            thrust::copy(
                h_super_suffixes.begin() + suffix_count,
                h_super_suffixes.begin() + suffix_count + n_suffixes,
                d_subbucket_suffixes.begin() );

            #if defined(COMPRESSION_SORTING)
            delay_list.set_offset( global_suffix_offset + suffix_count );

            const uint32 min_delay_pass = dcs == NULL ? 16u   : nvbio::max( util::divide_ri( dcs->Q, SYMBOLS_PER_WORD ), 8u );
            const uint32 max_delay_pass = dcs == NULL ? 1000u : nvbio::max( util::divide_ri( dcs->Q, SYMBOLS_PER_WORD ), 24u );
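
            // with a difference cover sample of period Q, any two suffixes become
            // comparable after at most Q symbol comparisons followed by a rank
            // lookup, i.e. after ceil( Q / SYMBOLS_PER_WORD ) words: suffixes left
            // unresolved past max_delay_pass words are pushed onto the delay list
            // and ranked later with the DCS-based comparator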

            compression_sort.sort(
                string_len,                   // the main string length
                string,                       // the main string
                n_suffixes,                   // number of suffixes to sort
                d_subbucket_suffixes.begin(), // the suffixes to sort
                min_delay_pass,               // minimum number of words to sort before possibly delaying
                max_delay_pass,               // maximum number of words to sort before delaying
                delay_list );                 // the delay list
            #else // if defined(COMPRESSION_SORTING)
            if (dcs)
            {
                // and sort the corresponding suffixes
                thrust::stable_sort(
                    d_subbucket_suffixes.begin(),
                    d_subbucket_suffixes.begin() + n_suffixes,
                    priv::DCS_string_suffix_less<SYMBOL_SIZE,string_type>(
                        string_len,
                        string,
                        nvbio::plain_view( *dcs ) ) );
            }
            else
            {
                // and sort the corresponding suffixes
                thrust::stable_sort(
                    d_subbucket_suffixes.begin(),
                    d_subbucket_suffixes.begin() + n_suffixes,
                    priv::string_suffix_less<SYMBOL_SIZE,string_type>( string_len, string ) );
            }
            #endif
            NVBIO_CUDA_DEBUG_STATEMENT( cudaDeviceSynchronize() );
            timer.stop();
            sufsort_time += timer.seconds();

            // process the (partially) sorted block
            output.process_batch(
                n_suffixes,
                nvbio::plain_view( d_subbucket_suffixes ) );

            #if defined(COMPRESSION_SORTING)
            // process delayed suffixes
            if (delay_list.count &&
                (delay_list.count >= DELAY_BUFFER ||
                 subbucket_end == bucket_end))
            {
                log_debug(stderr, " sort delayed-suffixes (%u)\n", delay_list.count);
                timer.start();

                // and sort the corresponding suffixes
                if (dcs)
                {
                    thrust::stable_sort(
                        delay_list.indices,
                        delay_list.indices + delay_list.count,
                        priv::DCS_string_suffix_less<SYMBOL_SIZE,string_type>(
                            string_len,
                            string,
                            nvbio::plain_view( *dcs ) ) );
                    /*
                    mgpu::MergesortKeys(
                        nvbio::plain_view( d_delayed_suffixes ),
                        delay_list.count,
                        priv::DCS_string_suffix_less<SYMBOL_SIZE,string_type>(
                            string_len,
                            string,
                            nvbio::plain_view( *dcs ) ),
                        *mgpu_ctxt );
                    */
                }
                else
                {
                    DiscardDelayList discard_list;
                    compression_sort.sort(
                        string_len,
                        string,
                        delay_list.count,
                        delay_list.indices,
                        uint32(-1),
                        uint32(-1),
                        discard_list );
                }

                NVBIO_CUDA_DEBUG_STATEMENT( cudaDeviceSynchronize() );
                timer.stop();
                sufsort_time += timer.seconds();
                compression_sort.stablesort_time += timer.seconds();
                //fprintf(stderr," %.1f s\n", timer.seconds());

                // scatter the delayed suffixes
                output.process_scattered(
                    delay_list.count,
                    nvbio::plain_view( d_delayed_suffixes ),
                    nvbio::plain_view( d_delayed_slots ) );

                // clear the list
                delay_list.clear();
            }
            #endif

            suffix_count += subbucket_size;
        }

        log_verbose(stderr,"\r sufsort : %.1fs (%.1f M suffixes/s) \n", sufsort_time, 1.0e-6f*float(global_suffix_offset + suffix_count)/sufsort_time);

        #if defined(COMPRESSION_SORTING)
        log_verbose(stderr," extract : %.1fs\n", compression_sort.extract_time);
        log_verbose(stderr," r-sort : %.1fs\n", compression_sort.radixsort_time);
        log_verbose(stderr," s-sort : %.1fs\n", compression_sort.stablesort_time);
        log_verbose(stderr," compress : %.1fs\n", compression_sort.compress_time);
        log_verbose(stderr," compact : %.1fs\n", compression_sort.compact_time);
        log_verbose(stderr," bwt-copy : %.1fs\n", bwt_copy_time);
        log_verbose(stderr," bwt-scat : %.1fs\n", bwt_scatter_time);
        #endif

        global_suffix_offset += suffix_count;
    }
}
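
// note on the output_handler concept used above: the handler must provide
//
//   process_batch( n_suffixes, d_suffixes )              - a contiguous block of sorted
//                                                          suffix indices, in device memory
//   process_scattered( n_suffixes, d_suffixes, d_slots ) - delayed suffixes together with
//                                                          the absolute output slots they map to
//
// (see the example sketch at the end of this file)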

/// Build the rank table of a Difference Cover Sample of a given string,
/// sorting the sampled suffixes with blockwise_suffix_sort()
///
template <typename string_type>
void blockwise_build(
    DCS&                                   dcs,
    const typename string_type::index_type string_len,
    string_type                            string,
    BWTParams*                             params)
{
    typedef typename string_type::index_type index_type;

    const index_type block_size = 16*1024*1024;

    //
    // build the list of DC sample suffixes
    //

    // compute the size of the DC sample
    //const index_type estimated_sample_size = index_type( dcs.estimate_sample_size( string_len ) );

    thrust::device_vector<uint8> d_temp_storage;

    index_type estimated_sample_size = 0u;

    for (index_type block_begin = 0; block_begin < string_len; block_begin += block_size)
    {
        const index_type block_end = nvbio::min(
            block_begin + block_size,
            string_len );

        const priv::DCS_predicate in_dcs( dcs.Q, nvbio::plain_view( dcs.d_bitmask ) );

        estimated_sample_size += cuda::reduce(
            uint32( block_end - block_begin ),
            thrust::make_transform_iterator(
                thrust::make_counting_iterator<uint32>(0u) + block_begin, in_dcs ),
            thrust::plus<uint32>(),
            d_temp_storage );
    }
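    // estimated_sample_size now counts the string positions falling in the
    // difference cover, evaluated blockwise to bound temporary storage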
    log_verbose(stderr," allocating DCS: %.1f MB\n", float(size_t( estimated_sample_size )*8u)/float(1024*1024));

    thrust::device_vector<uint32> d_sample( estimated_sample_size );

    index_type sample_size = 0;

    for (index_type block_begin = 0; block_begin < string_len; block_begin += block_size)
    {
        const index_type block_end = nvbio::min(
            block_begin + block_size,
            string_len );

        const priv::DCS_predicate in_dcs( dcs.Q, nvbio::plain_view( dcs.d_bitmask ) );

        sample_size += cuda::copy_if(
            uint32( block_end - block_begin ),
            thrust::make_counting_iterator<uint32>(0u) + block_begin,
            d_sample.begin() + sample_size,
            in_dcs,
            d_temp_storage );

        if (sample_size > estimated_sample_size)
        {
            log_error(stderr," DCS buffer overflow (%llu / %llu)\n", uint64( sample_size ), uint64( estimated_sample_size ));
            throw runtime_error( "DCS buffer overflow" );
        }
    }

    // alloc enough space for the DC ranks
    dcs.d_ranks.resize( util::round_i( sample_size, dcs.N ), 0u );

    // and sort the corresponding suffixes
    DCSSuffixRanker ranker( nvbio::plain_view( dcs ) );
    blockwise_suffix_sort(
        string_len,
        string,
        sample_size,
        d_sample.begin(),
        ranker,
        NULL,
        params );
}

} // namespace cuda
} // namespace nvbio
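
The output_handler argument of blockwise_suffix_sort() is constrained only by the two calls made inside it. Below is a minimal sketch of a conforming handler that gathers the sorted indices into a host-side suffix array; the struct name, the staging buffers, the pointer-based signatures and the usage comment are inferred assumptions for illustration, not part of NVBIO.

// a minimal sketch (not part of NVBIO): a handler gathering the sorted
// suffix indices produced by blockwise_suffix_sort() into a host-side
// suffix array; uint32 is assumed to be nvbio's 32-bit unsigned type
#include <vector>
#include <cuda_runtime.h>

struct SuffixArrayHandler
{
    // receive a contiguous block of (partially) sorted suffix indices,
    // appended after all previously processed blocks
    void process_batch(const uint32 n_suffixes, const uint32* d_suffixes)
    {
        h_block.resize( n_suffixes );
        cudaMemcpy( &h_block[0], d_suffixes, n_suffixes * sizeof(uint32), cudaMemcpyDeviceToHost );

        sa.insert( sa.end(), h_block.begin(), h_block.end() );
    }

    // receive suffixes whose ranking was delayed, together with the absolute
    // output slots they belong to, and patch the suffix array accordingly
    void process_scattered(const uint32 n_suffixes, const uint32* d_suffixes, const uint32* d_slots)
    {
        h_block.resize( n_suffixes );
        h_slots.resize( n_suffixes );
        cudaMemcpy( &h_block[0], d_suffixes, n_suffixes * sizeof(uint32), cudaMemcpyDeviceToHost );
        cudaMemcpy( &h_slots[0], d_slots,    n_suffixes * sizeof(uint32), cudaMemcpyDeviceToHost );

        // delayed suffixes always target slots of already-processed batches
        for (uint32 i = 0; i < n_suffixes; ++i)
            sa[ h_slots[i] ] = h_block[i];
    }

    std::vector<uint32> sa;      // the output suffix array
    std::vector<uint32> h_block; // host staging buffers
    std::vector<uint32> h_slots;
};

// hypothetical usage, assuming 'string' is a device-side packed string of
// length 'string_len' (both names are placeholders):
//
//   SuffixArrayHandler handler;
//   nvbio::cuda::blockwise_suffix_sort(
//       string_len, string,                             // the string to sort
//       string_len, thrust::make_counting_iterator(0u), // sort all suffixes
//       handler,                                        // the output handler
//       NULL,                                           // no difference cover sample
//       NULL );                                         // default memory budget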