NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
sufsort_inl.h
Go to the documentation of this file.
1 /*
2  * nvbio
3  * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #pragma once
29 
36 #include <nvbio/sufsort/dcs.h>
38 #include <nvbio/basic/omp.h>
40 #include <nvbio/basic/cuda/sort.h>
41 #include <nvbio/basic/timer.h>
42 #include <thrust/device_vector.h>
43 #include <thrust/transform_scan.h>
44 #include <thrust/binary_search.h>
45 #include <thrust/iterator/constant_iterator.h>
46 #include <thrust/iterator/counting_iterator.h>
47 #include <thrust/sort.h>
48 #include <mgpuhost.cuh>
49 #include <moderngpu.cuh>
50 
51 
52 namespace nvbio {
53 
54 namespace cuda {
55 
56 // return the position of the primary suffix of a string
57 //
58 template <typename string_type>
59 typename string_type::index_type find_primary(
60  const typename string_type::index_type string_len,
61  const string_type string)
62 {
63  NVBIO_VAR_UNUSED const uint32 SYMBOL_SIZE = string_type::SYMBOL_SIZE;
64 
65  // compute the primary by simply counting how many of the suffixes between 1 and N
66  // are lexicographically less than the primary suffix
67  return thrust::transform_reduce(
68  thrust::make_counting_iterator<uint32>(1u),
69  thrust::make_counting_iterator<uint32>(0u) + string_len,
72  0u ),
73  0u,
74  thrust::plus<uint32>() ) + 1u;
75 }
76 
77 // Sort the suffixes of all the strings in the given string_set
78 //
79 template <typename string_set_type, typename output_handler>
81  const string_set_type& string_set,
82  output_handler& output,
83  BWTParams* params)
84 {
85  typedef uint32 word_type;
86  NVBIO_VAR_UNUSED const uint32 WORD_BITS = uint32( 8u * sizeof(word_type) );
87  NVBIO_VAR_UNUSED const uint32 DOLLAR_BITS = 4;
88 
89  NVBIO_VAR_UNUSED const uint32 SYMBOL_SIZE = 2u;
90  NVBIO_VAR_UNUSED const uint32 SYMBOLS_PER_WORD = priv::symbols_per_word<SYMBOL_SIZE,WORD_BITS,DOLLAR_BITS>();
91 
92  int current_device;
93  cudaGetDevice( &current_device );
94  mgpu::ContextPtr mgpu_ctxt = mgpu::CreateCudaDevice( current_device );
95 
96  // instantiate a suffix flattener on the string set
97  priv::SetSuffixFlattener<SYMBOL_SIZE> suffixes( mgpu_ctxt );
98  suffixes.set( string_set );
99 
100  // compute the maximum number of words needed to represent a suffix
101  const uint32 m = (suffixes.max_length( string_set ) + SYMBOLS_PER_WORD-1) / SYMBOLS_PER_WORD;
102 
103  // compute the number of suffixes
104  const uint32 n_suffixes = suffixes.n_suffixes;
105 
106  thrust::device_vector<word_type> radices( n_suffixes*2 );
107  thrust::device_vector<uint32> indices( n_suffixes*2 );
108 
109  // initialize the list of suffix indices
110  thrust::copy(
111  thrust::make_counting_iterator<uint32>(0u),
112  thrust::make_counting_iterator<uint32>(n_suffixes),
113  indices.begin() );
114 
116  cuda::SortEnactor sort_enactor;
117 
118  sort_buffers.selector = 0;
119  sort_buffers.keys[0] = nvbio::device_view( radices );
120  sort_buffers.keys[1] = nvbio::device_view( radices ) + n_suffixes;
121  sort_buffers.values[0] = nvbio::device_view( indices );
122  sort_buffers.values[1] = nvbio::device_view( indices ) + n_suffixes;
123 
124  // do what is essentially an LSD radix-sort on the suffixes, word by word
125  for (int32 word_idx = m-1; word_idx >= 0; --word_idx)
126  {
127  // extract the given radix word from each of the partially sorted suffixes
128  suffixes.flatten(
129  string_set,
130  word_idx,
132  indices.begin() + sort_buffers.selector * n_suffixes,
133  radices.begin() + sort_buffers.selector * n_suffixes );
134 
135  // and sort them
136  sort_enactor.sort( n_suffixes, sort_buffers );
137  }
138 
139  output.process(
140  n_suffixes,
141  nvbio::device_view( indices ) + sort_buffers.selector * n_suffixes,
142  nvbio::device_view( suffixes.string_ids ),
143  nvbio::device_view( suffixes.cum_lengths ));
144 }
145 
146 // Sort all the suffixes of a given string
147 //
148 template <typename string_type, typename output_iterator>
150  const typename stream_traits<string_type>::index_type string_len,
151  const string_type string,
152  output_iterator output,
153  BWTParams* params)
154 {
155  PrefixDoublingSufSort sufsort;
156  sufsort.sort(
157  string_len,
158  string,
159  output + 1u );
160 
161  // assign the zero'th suffix
162  output[0] = string_len;
163 
164  NVBIO_CUDA_DEBUG_STATEMENT( log_verbose(stderr," extract : %5.1f ms\n", 1.0e3f * sufsort.extract_time) );
165  NVBIO_CUDA_DEBUG_STATEMENT( log_verbose(stderr," gather : %5.1f ms\n", 1.0e3f * sufsort.gather_time) );
166  NVBIO_CUDA_DEBUG_STATEMENT( log_verbose(stderr," r-sort : %5.1f ms\n", 1.0e3f * sufsort.radixsort_time) );
167  NVBIO_CUDA_DEBUG_STATEMENT( log_verbose(stderr," segment : %5.1f ms\n", 1.0e3f * sufsort.segment_time) );
168  NVBIO_CUDA_DEBUG_STATEMENT( log_verbose(stderr," invert : %5.1f ms\n", 1.0e3f * sufsort.inverse_time) );
169  NVBIO_CUDA_DEBUG_STATEMENT( log_verbose(stderr," compact : %5.1f ms\n", 1.0e3f * sufsort.compact_time) );
170 }
171 
172 // Sort all the suffixes of a given string
173 //
174 template <typename string_type, typename output_handler>
176  const typename string_type::index_type string_len,
177  string_type string,
178  output_handler& output,
179  BWTParams* params)
180 {
181  typedef typename string_type::index_type index_type;
182 
183  // find a suitable Difference Cover...
184  const size_t needed_bytes_64 = size_t( DCS::estimated_sample_size<64>( string_len ) ) * 8u;
185  const size_t needed_bytes_128 = size_t( DCS::estimated_sample_size<128>( string_len ) ) * 8u;
186  const size_t needed_bytes_256 = size_t( DCS::estimated_sample_size<256>( string_len ) ) * 8u;
187  const size_t needed_bytes_512 = size_t( DCS::estimated_sample_size<512>( string_len ) ) * 8u;
188  const size_t needed_bytes_1024 = size_t( DCS::estimated_sample_size<1024>( string_len ) ) * 8u;
189 
190  size_t free, total;
191  cudaMemGetInfo(&free, &total);
192 
193  DCS dcs;
194 
195  if (free >= 2*needed_bytes_64)
196  dcs.init<64>();
197  else if (free >= 2*needed_bytes_128)
198  dcs.init<128>();
199  else if (free >= 2*needed_bytes_256)
200  dcs.init<256>();
201  else if (free >= 2*needed_bytes_512)
202  dcs.init<512>();
203  else if (free >= 2*needed_bytes_1024)
204  dcs.init<1024>();
205  else
206  dcs.init<2048>();
207 
208  // build a table for our Difference Cover
209  log_verbose(stderr, " building DCS-%u... started\n", dcs.Q);
210 
212  dcs,
213  string_len,
214  string,
215  params );
216 
217  log_verbose(stderr, " building DCS-%u... done\n", dcs.Q);
218 
219  // and do the Difference Cover based sorting
220  log_verbose(stderr, " DCS-based sorting... started\n");
221 
223  string_len,
224  string,
225  string_len,
226  thrust::make_counting_iterator<uint32>(0u),
227  output,
228  &dcs,
229  params );
230 
231  log_verbose(stderr, " DCS-based sorting... done\n");
232 }
233 
234 // Compute the bwt of a device-side string
235 //
236 // \return position of the primary suffix / $ symbol
237 //
238 template <typename string_type, typename output_iterator>
239 typename string_type::index_type bwt(
240  const typename string_type::index_type string_len,
241  string_type string,
242  output_iterator output,
243  BWTParams* params)
244 {
245  typedef typename string_type::index_type index_type;
246 
247  // build a BWT handler
249  string_len,
250  string,
251  output );
252 
253  // and pass it to the blockwise suffix sorter
255  string_len,
256  string,
257  bwt_handler,
258  params );
259 
260  log_verbose(stderr,"\n primary at %llu\n", bwt_handler.primary);
261 
262  // shift back all symbols following the primary
263  bwt_handler.remove_dollar();
264 
265  return bwt_handler.primary;
266 }
267 
268 template <uint32 BUCKETING_BITS_T, uint32 SYMBOL_SIZE, bool BIG_ENDIAN, typename storage_type>
270 {
271  typedef typename std::iterator_traits<storage_type>::value_type word_type;
272 
273  static const uint32 WORD_BITS = uint32( 8u * sizeof(word_type) );
274  static const uint32 DOLLAR_BITS = WORD_BITS <= 32 ? 4 : 5;
275  static const uint32 BUCKETING_BITS = BUCKETING_BITS_T;
276 
277  typedef ConcatenatedStringSet<
280 
282 
285  SYMBOL_SIZE,BIG_ENDIAN,storage_type,
288 };
289 
290 template <uint32 BUCKETING_BITS_T, uint32 SYMBOL_SIZE, bool BIG_ENDIAN, typename storage_type>
292 {
293  typedef typename std::iterator_traits<storage_type>::value_type word_type;
294 
295  static const uint32 WORD_BITS = uint32( 8u * sizeof(word_type) );
296  static const uint32 DOLLAR_BITS = WORD_BITS <= 32 ? 4 : 5;
297  static const uint32 BUCKETING_BITS = BUCKETING_BITS_T;
298 
299  typedef ConcatenatedStringSet<
302 
304 
307  SYMBOL_SIZE,BIG_ENDIAN,storage_type,
309 };
310 
311 template <uint32 BUCKETING_BITS_T, uint32 SYMBOL_SIZE, bool BIG_ENDIAN, typename storage_type>
313 {
314  typedef typename std::iterator_traits<storage_type>::value_type word_type;
315 
316  static const uint32 WORD_BITS = uint32( 8u * sizeof(word_type) );
317  static const uint32 DOLLAR_BITS = WORD_BITS <= 32 ? 4 : 5;
318  static const uint32 BUCKETING_BITS = BUCKETING_BITS_T;
319 
320  typedef ConcatenatedStringSet<
323 
325 
328  SYMBOL_SIZE,BIG_ENDIAN,storage_type,
331 };
332 
333 // simple status class
335 {
336  enum {
337  OK = 0,
339  };
340 
341  // default constructor
343 
344  // return whether the status is OK
345  operator bool() const { return code == OK ? true : false; }
346 
350 };
351 
352 template <typename ConfigType, uint32 SYMBOL_SIZE, bool BIG_ENDIAN, typename storage_type>
354 {
355  typedef typename std::iterator_traits<storage_type>::value_type word_type;
356  typedef typename ConfigType::string_set_handler string_set_handler_type;
357  typedef typename ConfigType::bucket_type bucket_type;
358  typedef typename ConfigType::suffix_bucketer suffix_bucketer_type;
359 
360  typedef ConcatenatedStringSet<
363 
364  // compute the maximum sub-bucket size
365  //
367  const thrust::host_vector<uint32>& h_buckets,
368  const uint64 max_super_block_size,
369  const uint32 limit,
370  LargeBWTStatus* status)
371  {
372  NVBIO_VAR_UNUSED const uint32 DOLLAR_BITS = ConfigType::DOLLAR_BITS;
373  NVBIO_VAR_UNUSED const uint32 DOLLAR_MASK = (1u << DOLLAR_BITS) - 1u;
374 
375  uint32 max_size = 0u;
376  uint32 max_index = 0u;
377 
378  // build the subbucket pointers
379  for (uint32 bucket_begin = 0, bucket_end = 0; bucket_begin < h_buckets.size(); bucket_begin = bucket_end)
380  {
381  // grow the block of buckets until we can
382  uint64 bucket_size;
383  for (bucket_size = 0; (bucket_end < h_buckets.size()) && (bucket_size + h_buckets[bucket_end] <= max_super_block_size); ++bucket_end)
384  bucket_size += h_buckets[bucket_end];
385 
386  // check whether a single bucket exceeds our host buffer capacity
387  // TODO: if this is a short-string bucket, we could handle it with special care,
388  // but it requires modifying the collecting loop to output everything directly.
389  if (bucket_end == bucket_begin)
390  throw nvbio::runtime_error("bucket %u contains %u strings: buffer overflow!", bucket_begin, h_buckets[bucket_begin]);
391 
392  // loop through the sub-buckets
393  for (uint32 subbucket = bucket_begin; subbucket < bucket_end; ++subbucket)
394  {
395  // only keep track of buckets that are NOT short-string buckets
396  if ((subbucket & DOLLAR_MASK) == DOLLAR_MASK)
397  {
398  if (max_size < h_buckets[subbucket])
399  {
400  max_size = h_buckets[subbucket];
401  max_index = subbucket;
402  }
403  }
404  }
405  }
406 
407  if (max_size > limit)
408  {
410  status->bucket_size = max_size;
411  status->bucket_index = max_index;
412  }
413 
414  return max_size;
415  }
416 
417  // construct the sub-bucket lists
418  //
419  static void build_subbuckets(
420  const thrust::host_vector<uint32>& h_buckets,
421  thrust::host_vector<uint32>& h_subbuckets,
422  const uint64 max_super_block_size,
423  const uint32 max_block_size)
424  {
425  NVBIO_VAR_UNUSED const uint32 DOLLAR_BITS = ConfigType::DOLLAR_BITS;
426  NVBIO_VAR_UNUSED const uint32 DOLLAR_MASK = (1u << DOLLAR_BITS) - 1u;
427 
428  // build the subbucket pointers
429  for (uint32 bucket_begin = 0, bucket_end = 0; bucket_begin < h_buckets.size(); bucket_begin = bucket_end)
430  {
431  // grow the block of buckets until we can
432  uint64 bucket_size;
433  for (bucket_size = 0; (bucket_end < h_buckets.size()) && (bucket_size + h_buckets[bucket_end] <= max_super_block_size); ++bucket_end)
434  bucket_size += h_buckets[bucket_end];
435 
436  // check whether a single bucket exceeds our host buffer capacity
437  // TODO: if this is a short-string bucket, we could handle it with special care,
438  // but it requires modifying the collecting loop to output everything directly.
439  if (bucket_end == bucket_begin)
440  throw nvbio::runtime_error("bucket %u contains %u strings: buffer overflow!", bucket_begin, h_buckets[bucket_begin]);
441 
442  // build the sub-buckets
443  for (uint32 subbucket_begin = bucket_begin, subbucket_end = bucket_begin; subbucket_begin < bucket_end; subbucket_begin = subbucket_end)
444  {
445  if (h_buckets[subbucket_begin] > max_block_size)
446  {
447  // if this is NOT a short-string bucket, we can't cope with it
448  if ((subbucket_begin & DOLLAR_MASK) == DOLLAR_MASK)
449  throw nvbio::runtime_error("bucket %u contains %u strings: buffer overflow!", subbucket_begin, h_buckets[subbucket_begin]);
450 
451  // this is a short-string bucket: we can handle it with special care
452  h_subbuckets[ subbucket_end++ ] = subbucket_begin; // point to the beginning of this sub-bucket
453  }
454  else
455  {
456  // grow the block of sub-buckets until we can
457  uint32 subbucket_size;
458  for (subbucket_size = 0; (subbucket_end < bucket_end) && (subbucket_size + h_buckets[subbucket_end] <= max_block_size); ++subbucket_end)
459  {
460  subbucket_size += h_buckets[subbucket_end];
461 
462  h_subbuckets[ subbucket_end ] = subbucket_begin; // point to the beginning of this sub-bucket
463  }
464  }
465  }
466  }
467  }
468 
469  template <typename output_handler>
471  const string_set_type string_set,
472  output_handler& output,
473  BWTParams* params)
474  {
475  NVBIO_VAR_UNUSED const uint32 BUCKETING_BITS = ConfigType::BUCKETING_BITS;
476  NVBIO_VAR_UNUSED const uint32 DOLLAR_BITS = ConfigType::DOLLAR_BITS;
477  NVBIO_VAR_UNUSED const uint32 DOLLAR_MASK = (1u << DOLLAR_BITS) - 1u;
478  NVBIO_VAR_UNUSED const uint32 SLICE_SIZE = params ? params->radix_slice : 4u;
479 
480  const uint32 N = string_set.size();
481 
482  LargeBWTStatus status;
483 
484  // allocate an MGPU context
485  int current_device;
486  cudaGetDevice( &current_device );
487  mgpu::ContextPtr mgpu_ctxt = mgpu::CreateCudaDevice( current_device );
488 
489  suffix_bucketer_type bucketer( mgpu_ctxt );
490  string_set_handler_type string_set_handler( string_set );
491  cuda::CompressionSort string_sorter( mgpu_ctxt );
492 
493  priv::DollarExtractor dollars;
494 
495  const uint64 max_super_block_size = params ? // requires max_super_block_size*8 host memory bytes
496  (params->host_memory - uint64(128u*1024u*1024u)) / 8u : // leave 128MB for the bucket counters
497  512*1024*1024;
498  uint32 max_block_size = params ?
499  params->device_memory / 32 : // requires max_block_size*32 device memory bytes
500  32*1024*1024; // default: 1GB
501 
502  log_verbose(stderr," super-block-size: %.1f M\n", float(max_super_block_size)/float(1024*1024));
503  log_verbose(stderr," block-size: %.1f M\n", float(max_block_size)/float(1024*1024));
504  thrust::host_vector<uint2> h_suffixes( max_super_block_size );
505  thrust::host_vector<uint8> h_block_bwt;
506 
507  // reuse some buffers
508  thrust::device_vector<uint32> d_indices;
509  thrust::device_vector<uint2> d_bucket_suffixes;
510  thrust::device_vector<uint8> d_block_bwt;
511  //thrust::device_vector<uint8> d_temp_storage;
512 
513  //
514  // split the suffixes in buckets, and count them
515  //
516 
517  const uint32 n_buckets = 1u << BUCKETING_BITS;
518 
519  thrust::host_vector<uint32> h_buckets( n_buckets );
520  thrust::host_vector<uint32> h_subbuckets( n_buckets );
521 
522  // count how many suffixes fall in each bucket
523  const uint64 total_suffixes = bucketer.count( string_set, h_buckets );
524 
525  // find the maximum bucket size
526  const uint32 max_bucket_size = thrust::reduce(
527  h_buckets.begin(),
528  h_buckets.end(),
529  0u,
530  thrust::maximum<uint32>() );
531 
532  // compute the largest non-elementary bucket
533  const uint32 largest_subbucket = max_subbucket_size( h_buckets, max_super_block_size, max_block_size, &status );
534  if (!status)
535  {
536  log_verbose(stderr," exceeded maximum bucket size\n");
537  return status;
538  }
539 
540  log_verbose(stderr," max bucket size: %u (%u)\n", largest_subbucket, max_bucket_size);
541  bucketer.log_count_stats();
542 
543  float bwt_time = 0.0f;
544  float output_time = 0.0f;
545 
546  // output the last character of each string (i.e. the symbols preceding all the dollar signs)
547  const uint32 block_size = max_block_size / 4u; // this can be done in relatively small blocks
548  for (uint32 block_begin = 0; block_begin < N; block_begin += block_size)
549  {
550  const uint32 block_end = nvbio::min( block_begin + block_size, N );
551 
552  // consume subbucket_size suffixes
553  const uint32 n_suffixes = block_end - block_begin;
554 
555  Timer timer;
556  timer.start();
557 
558  priv::alloc_storage( h_block_bwt, n_suffixes );
559  priv::alloc_storage( d_block_bwt, n_suffixes );
560 
561  // load the BWT symbols
562  string_set_handler.dollar_bwt(
563  block_begin,
564  block_end,
565  plain_view( h_block_bwt ) );
566 
567  // copy them to the device
568  thrust::copy(
569  h_block_bwt.begin(),
570  h_block_bwt.begin() + n_suffixes,
571  d_block_bwt.begin() );
572 
573  timer.stop();
574  bwt_time += timer.seconds();
575 
576  timer.start();
577 
578  // extract the dollars
579  const uint32 n_dollars = dollars.extract(
580  n_suffixes,
581  raw_pointer( h_block_bwt ),
582  raw_pointer( d_block_bwt ),
583  NULL,
584  NULL,
585  NULL );
586 
587  // invoke the output handler
588  output.process(
589  n_suffixes,
590  raw_pointer( h_block_bwt ),
591  n_dollars,
592  raw_pointer( dollars.h_dollar_ranks ),
593  raw_pointer( dollars.h_dollars ) );
594 
595  timer.stop();
596  output_time += timer.seconds();
597  }
598 
599  bucketer.clear_timers();
600 
601  //
602  // at this point, we have to do multiple passes through the input string set,
603  // collecting in each pass as many buckets as we can fit in memory at once
604  //
605 
606  float sufsort_time = 0.0f;
607  float collect_time = 0.0f;
608 
609  // reduce the scratchpads size if possible
610  const uint32 optimal_block_size = 32*1024*1024;
611  if (largest_subbucket <= optimal_block_size)
612  max_block_size = optimal_block_size;
613 
614  // reserve memory for scratchpads
615  {
616  log_verbose(stderr," allocating scratchpads\n" );
617 
618  string_set_handler.reserve( max_block_size, SLICE_SIZE );
619  string_sorter.reserve( max_block_size );
620 
621  priv::alloc_storage( h_block_bwt, max_block_size );
622  priv::alloc_storage( d_block_bwt, max_block_size );
623  priv::alloc_storage( d_indices, max_block_size );
624  priv::alloc_storage( d_bucket_suffixes, max_block_size );
625 
626  log_verbose(stderr," allocated device memory: %.1f MB\n",
627  float( bucketer.allocated_device_memory() +
628  string_set_handler.allocated_device_memory() +
629  string_sorter.allocated_device_memory() +
630  d_block_bwt.size() * sizeof(uint8) +
631  d_indices.size() * sizeof(uint32) +
632  d_bucket_suffixes.size() * sizeof(uint2)
633  ) / float(1024*1024) );
634  log_verbose(stderr," bucketer : %.1f MB\n", float( bucketer.allocated_device_memory() ) / float(1024*1024) );
635  log_verbose(stderr," handler : %.1f MB\n", float( string_set_handler.allocated_device_memory() ) / float(1024*1024) );
636  log_verbose(stderr," sorter : %.1f MB\n", float( string_sorter.allocated_device_memory() ) / float(1024*1024) );
637 
638  log_verbose(stderr," allocated host memory: %.1f MB\n",
639  float( bucketer.allocated_host_memory() +
640  string_set_handler.allocated_host_memory() +
641  h_block_bwt.size() * sizeof(uint8) +
642  h_suffixes.size() * sizeof(uint2) +
643  h_buckets.size() * sizeof(uint32) +
644  h_subbuckets.size() * sizeof(uint32)
645  ) / float(1024*1024) );
646  }
647 
648  // now build the sub-bucket lists
650  h_buckets,
651  h_subbuckets,
652  max_super_block_size,
653  max_block_size );
654 
655  uint64 global_suffix_offset = 0;
656 
657  for (uint32 bucket_begin = 0, bucket_end = 0; bucket_begin < h_buckets.size(); bucket_begin = bucket_end)
658  {
659  // grow the block of buckets until we can
660  uint64 bucket_size;
661  for (bucket_size = 0; (bucket_end < h_buckets.size()) && (bucket_size + h_buckets[bucket_end] <= max_super_block_size); ++bucket_end)
662  bucket_size += h_buckets[bucket_end];
663 
664  //
665  // do a global scan to find all suffixes falling in the current super-bucket
666  //
667 
668  uint64 suffix_count = 0;
669  uint32 max_suffix_len = 0;
670 
671  log_verbose(stderr," collect buckets[%u:%u] (%llu suffixes)\n", bucket_begin, bucket_end, bucket_size);
672  Timer collect_timer;
673  collect_timer.start();
674 
675  suffix_count = bucketer.collect(
676  string_set,
677  bucket_begin,
678  bucket_end,
679  max_suffix_len,
680  h_subbuckets,
681  h_suffixes );
682 
683  collect_timer.stop();
684  collect_time += collect_timer.seconds();
685  log_verbose(stderr," collect : %.1fs (%.1f M suffixes/s - %.1f M scans/s)\n", collect_time, 1.0e-6f*float(global_suffix_offset + suffix_count)/collect_time, 1.0e-6f*float(total_suffixes)/collect_time);
686  bucketer.log_collect_stats();
687 
688  //
689  // at this point we have a large collection of localized suffixes to sort in h_suffixes;
690  // we'll do it looping on multiple sub-buckets, on the GPU
691  //
692 
693  suffix_count = 0u;
694 
695  const uint32 n_words = string_set_handler.num_words( max_suffix_len );
696 
697  for (uint32 subbucket_begin = bucket_begin, subbucket_end = bucket_begin; subbucket_begin < bucket_end; subbucket_begin = subbucket_end)
698  {
699  if (h_buckets[subbucket_begin] > max_block_size)
700  {
701  // check if this is not a short-string bucket - it should never actually happen as we already tested for it
702  if ((subbucket_begin & DOLLAR_MASK) == DOLLAR_MASK)
703  throw nvbio::runtime_error("bucket %u contains %u strings: overflow!", subbucket_begin, h_buckets[subbucket_begin]);
704 
705  // advance by one
706  ++subbucket_end;
707 
708  const uint32 subbucket_size = h_buckets[subbucket_begin];
709 
710  Timer suf_timer;
711  suf_timer.start();
712 
713  // chop the bucket in multiple blocks
714  for (uint32 block_begin = 0; block_begin < subbucket_size; block_begin += max_block_size)
715  {
716  const uint32 block_end = nvbio::min( block_begin + max_block_size, subbucket_size );
717 
718  // consume subbucket_size suffixes
719  const uint32 n_suffixes = block_end - block_begin;
720 
721  // copy the host suffixes to the device
722  const uint2* h_bucket_suffixes = &h_suffixes[0] + suffix_count + block_begin;
723 
724  // copy the suffix list to the device
725  priv::alloc_storage( d_bucket_suffixes, n_suffixes );
726  thrust::copy(
727  h_bucket_suffixes,
728  h_bucket_suffixes + n_suffixes,
729  d_bucket_suffixes.begin() );
730 
731  // initialize the set radices
732  string_set_handler.init( n_suffixes, h_bucket_suffixes, nvbio::plain_view( d_bucket_suffixes ) );
733 
734  Timer timer;
735  timer.start();
736 
737  priv::alloc_storage( h_block_bwt, n_suffixes );
738  priv::alloc_storage( d_block_bwt, n_suffixes );
739 
740  // load the BWT symbols
741  string_set_handler.bwt(
742  n_suffixes,
743  (const uint32*)NULL,
744  plain_view( h_block_bwt ),
745  plain_view( d_block_bwt ) );
746 
747  timer.stop();
748  bwt_time += timer.seconds();
749 
750  timer.start();
751 
752  // extract the dollars
753  const uint32 n_dollars = dollars.extract(
754  n_suffixes,
755  raw_pointer( h_block_bwt ),
756  raw_pointer( d_block_bwt ),
757  h_bucket_suffixes,
758  raw_pointer( d_bucket_suffixes ),
759  NULL );
760 
761  // invoke the output handler
762  output.process(
763  n_suffixes,
764  raw_pointer( h_block_bwt ),
765  n_dollars,
766  raw_pointer( dollars.h_dollar_ranks ),
767  raw_pointer( dollars.h_dollars ) );
768 
769  timer.stop();
770  output_time += timer.seconds();
771  }
772 
773  suffix_count += subbucket_size;
774 
775  suf_timer.stop();
776  sufsort_time += suf_timer.seconds();
777  }
778  else
779  {
780  // grow the block of sub-buckets until we can
781  uint32 subbucket_size;
782  for (subbucket_size = 0; (subbucket_end < bucket_end) && (subbucket_size + h_buckets[subbucket_end] <= max_block_size); ++subbucket_end)
783  subbucket_size += h_buckets[subbucket_end];
784 
785  log_verbose(stderr,"\r sufsort buckets[%u:%u] (%.1f M suffixes/s) ", subbucket_begin, subbucket_end, 1.0e-6f*float(global_suffix_offset + suffix_count)/sufsort_time);
786  if (subbucket_size == 0)
787  continue;
788 
789  // consume subbucket_size suffixes
790  const uint32 n_suffixes = subbucket_size;
791 
792  try
793  {
794  // reserve enough space
795  priv::alloc_storage( d_indices, max_block_size );
796  }
797  catch (...)
798  {
799  log_error(stderr, "LargeBWTSkeleton: d_indices allocation failed!\n");
800  throw;
801  }
802 
803  Timer suf_timer;
804  suf_timer.start();
805 
806  // copy the host suffixes to the device
807  const uint2* h_bucket_suffixes = &h_suffixes[0] + suffix_count;
808 
809  priv::alloc_storage( d_bucket_suffixes, n_suffixes );
810 
811  // copy the suffix list to the device
812  thrust::copy(
813  h_bucket_suffixes,
814  h_bucket_suffixes + n_suffixes,
815  d_bucket_suffixes.begin() );
816 
817  NVBIO_CUDA_DEBUG_STATEMENT( log_debug( stderr, "\n initialize radices\n" ) );
818 
819  // initialize the set radices
820  string_set_handler.init( n_suffixes, h_bucket_suffixes, nvbio::plain_view( d_bucket_suffixes ) );
821 
822  NVBIO_CUDA_DEBUG_STATEMENT( log_debug( stderr, " sort strings\n" ) );
823 
824  cuda::DiscardDelayList delay_list;
825 
826  string_sorter.sort(
827  string_set_handler,
828  n_suffixes,
829  n_words,
830  thrust::make_counting_iterator<uint32>(0u),
831  d_indices.begin(),
832  uint32(-1),
833  delay_list,
834  SLICE_SIZE );
835 
836  NVBIO_CUDA_DEBUG_STATEMENT( log_debug( stderr, " BWT transform\n" ) );
837 
838  Timer timer;
839  timer.start();
840 
841  priv::alloc_storage( h_block_bwt, n_suffixes );
842  priv::alloc_storage( d_block_bwt, n_suffixes );
843 
844  // load the BWT symbols
845  string_set_handler.bwt(
846  n_suffixes,
847  plain_view( d_indices ),
848  plain_view( h_block_bwt ),
849  plain_view( d_block_bwt ) );
850 
851  timer.stop();
852  bwt_time += timer.seconds();
853 
854  NVBIO_CUDA_DEBUG_STATEMENT( log_debug( stderr, " BWT output\n" ) );
855 
856  timer.start();
857 
858  // invoke the output handler
859  const uint32 n_dollars = dollars.extract(
860  n_suffixes,
861  raw_pointer( h_block_bwt ),
862  raw_pointer( d_block_bwt ),
863  h_bucket_suffixes,
864  raw_pointer( d_bucket_suffixes ),
865  raw_pointer( d_indices ) );
866 
867  // invoke the output handler
868  output.process(
869  n_suffixes,
870  raw_pointer( h_block_bwt ),
871  n_dollars,
872  raw_pointer( dollars.h_dollar_ranks ),
873  raw_pointer( dollars.h_dollars ) );
874 
875  timer.stop();
876  output_time += timer.seconds();
877 
878  suffix_count += subbucket_size;
879 
880  suf_timer.stop();
881  sufsort_time += suf_timer.seconds();
882  }
883  }
884  log_verbose(stderr,"\r sufsort : %.1fs (%.1f M suffixes/s) \n", sufsort_time, 1.0e-6f*float(global_suffix_offset + suffix_count)/sufsort_time);
885  log_verbose(stderr," copy : %.1fs\n", string_sorter.copy_time);
886  log_verbose(stderr," extract : %.1fs\n", string_sorter.extract_time);
887  log_verbose(stderr," r-sort : %.1fs\n", string_sorter.radixsort_time);
888  log_verbose(stderr," compress : %.1fs\n", string_sorter.compress_time);
889  log_verbose(stderr," compact : %.1fs\n", string_sorter.compact_time);
890  log_verbose(stderr," scatter : %.1fs\n", string_sorter.scatter_time);
891  log_verbose(stderr," bwt : %.1fs\n", bwt_time);
892  log_verbose(stderr," output : %.1fs\n", output_time);
893 
894  global_suffix_offset += suffix_count;
895  }
896  return status;
897  }
898 };
899 
900 // Compute the bwt of a device-side string set
901 //
902 template <uint32 SYMBOL_SIZE, bool BIG_ENDIAN, typename storage_type, typename output_handler>
903 void bwt(
904  const ConcatenatedStringSet<
906  uint64*> string_set,
907  output_handler& output,
908  BWTParams* params)
909 {
910  typedef cuda::DeviceBWTConfig<16,SYMBOL_SIZE,BIG_ENDIAN,storage_type> config_type_16; // 16-bits bucketing
911  typedef cuda::DeviceBWTConfig<20,SYMBOL_SIZE,BIG_ENDIAN,storage_type> config_type_20; // 20-bits bucketing
912  typedef cuda::DeviceBWTConfig<24,SYMBOL_SIZE,BIG_ENDIAN,storage_type> config_type_24; // 24-bits bucketing
913 
914  cuda::LargeBWTStatus status;
915 
916  // try 16-bit bucketing
918  string_set,
919  output,
920  params ))
921  return;
922 
923  // try 20-bit bucketing
925  string_set,
926  output,
927  params ))
928  return;
929 
930  // try 24-bit bucketing
932  string_set,
933  output,
934  params ))
935  return;
936 
937  if (status.code == LargeBWTStatus::LargeBucket)
938  throw nvbio::runtime_error("subbucket %u contains %u strings: buffer overflow!\n please try increasing the device memory limit to at least %u MB\n", status.bucket_index, status.bucket_size, util::divide_ri( status.bucket_size, 1024u*1024u )*32u);
939 }
940 
941 } // namespace cuda
942 
943 // Compute the bwt of a host-side string set
944 //
945 template <uint32 SYMBOL_SIZE, bool BIG_ENDIAN, typename storage_type, typename output_handler>
947  const ConcatenatedStringSet<
949  uint64*> string_set,
950  output_handler& output,
951  BWTParams* params)
952 {
953  cuda::LargeBWTStatus status;
954 
955  const uint32 bucketing_bits = params ? params->bucketing_bits : 16u;
956 
957  if (params && params->cpu_bucketing)
958  {
959  typedef cuda::HostBWTConfigCPUBucketer<16,SYMBOL_SIZE,BIG_ENDIAN,storage_type> config_type_16; // 16-bits bucketing
960  typedef cuda::HostBWTConfigCPUBucketer<20,SYMBOL_SIZE,BIG_ENDIAN,storage_type> config_type_20; // 20-bits bucketing
961  typedef cuda::HostBWTConfigCPUBucketer<24,SYMBOL_SIZE,BIG_ENDIAN,storage_type> config_type_24; // 24-bits bucketing
962  typedef cuda::HostBWTConfigCPUBucketer<26,SYMBOL_SIZE,BIG_ENDIAN,storage_type> config_type_26; // 26-bits bucketing
963 
964  // try 16-bit bucketing
965  if (bucketing_bits <= 16u)
966  {
968  string_set,
969  output,
970  params ))
971  return;
972  }
973 
974  // try 20-bit bucketing
975  if (bucketing_bits <= 20u)
976  {
978  string_set,
979  output,
980  params ))
981  return;
982  }
983 
984  // try 24-bit bucketing
985  if (bucketing_bits <= 24u)
986  {
988  string_set,
989  output,
990  params ))
991  return;
992  }
993 
994  // try 26-bit bucketing
996  string_set,
997  output,
998  params ))
999  return;
1000  }
1001  else
1002  {
1003  typedef cuda::HostBWTConfigGPUBucketer<16,SYMBOL_SIZE,BIG_ENDIAN,storage_type> config_type_16; // 16-bits bucketing
1004  typedef cuda::HostBWTConfigGPUBucketer<20,SYMBOL_SIZE,BIG_ENDIAN,storage_type> config_type_20; // 20-bits bucketing
1005  typedef cuda::HostBWTConfigGPUBucketer<24,SYMBOL_SIZE,BIG_ENDIAN,storage_type> config_type_24; // 24-bits bucketing
1006  typedef cuda::HostBWTConfigGPUBucketer<26,SYMBOL_SIZE,BIG_ENDIAN,storage_type> config_type_26; // 26-bits bucketing
1007 
1008  // try 16-bit bucketing
1009  if (bucketing_bits <= 16u)
1010  {
1012  string_set,
1013  output,
1014  params ))
1015  return;
1016  }
1017 
1018  // try 20-bit bucketing
1019  if (bucketing_bits <= 20u)
1020  {
1022  string_set,
1023  output,
1024  params ))
1025  return;
1026  }
1027 
1028  // try 24-bit bucketing
1029  if (bucketing_bits <= 24u)
1030  {
1032  string_set,
1033  output,
1034  params ))
1035  return;
1036  }
1037 
1038  // try 26-bit bucketing
1040  string_set,
1041  output,
1042  params ))
1043  return;
1044  }
1045 
1047  throw nvbio::runtime_error("subbucket %u contains %u strings: buffer overflow!\n please try increasing the device memory limit to at least %u MB\n", status.bucket_index, status.bucket_size, util::divide_ri( status.bucket_size, 1024u*1024u )*32u);
1048 }
1049 
1050 } // namespace nvbio