// prefix_doubling_sufsort.h
/*
 * nvbio
 * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *    * Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *    * Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *    * Neither the name of the NVIDIA CORPORATION nor the
 *      names of its contributors may be used to endorse or promote products
 *      derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#pragma once

#include <nvbio/sufsort/sufsort_priv.h>
#include <nvbio/basic/timer.h>
#include <nvbio/basic/cuda/sort.h>
#include <nvbio/basic/cuda/ldg.h>
#include <thrust/device_vector.h>
#include <thrust/transform_scan.h>
#include <thrust/binary_search.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <mgpuhost.cuh>
#include <moderngpu.cuh>
#include <cub/cub.cuh>

namespace nvbio {
namespace cuda {

/// A sorting enactor for sorting all the suffixes of a given string
/// using a GPU parallel prefix-doubling algorithm
///
struct PrefixDoublingSufSort
{
    /// constructor
    ///
    PrefixDoublingSufSort() :
        extract_time(0.0f),
        gather_time(0.0f),
        radixsort_time(0.0f),
        segment_time(0.0f),
        compact_time(0.0f),
        inverse_time(0.0f) { m_mgpu = mgpu::CreateCudaDevice(0); }

    /// sort all the suffixes of a given string
    ///
    /// \param string_len    the length of the input string
    /// \param string        the input string
    /// \param d_suffixes    device iterator to the output suffix indices
    ///
    template <typename string_type, typename output_iterator>
    void sort(
        const typename stream_traits<string_type>::index_type string_len,
        const string_type                                      string,
        output_iterator                                        d_suffixes);

    /// reserve enough storage for sorting n suffixes
    ///
    void reserve(const uint32 n)
    {
        if (n > d_active_slots.size())
        {
            clear();

            d_inv_keys.resize( n );
            d_active_slots.resize( n + 4 );
            d_sort_keys.resize( n + 4 );
            d_sort_indices.resize( n + 4 );
            d_partial_flags.resize( n + 16 );
            d_temp_flags.resize( n + 16 );
            d_segment_flags.resize( n + 16 );
            d_segment_keys.resize( n + 4 );
            d_segment_heads.resize( n + 4 );

            // block sums for the segment-extraction kernels: one entry per block plus
            // a sentinel; (n+4+127)/128 is a safe upper bound on the block count
            d_segment_blocks.resize( ((n+4)+127)/128 + 1 );
        }
    }

    /// free all temporary storage
    ///
    void clear()
    {
        d_inv_keys.clear();
        d_sort_keys.clear();
        d_sort_indices.clear();
        d_partial_flags.clear();
        d_temp_flags.clear();
        d_segment_flags.clear();
        d_segment_keys.clear();
        d_segment_heads.clear();
        d_active_slots.clear();
        d_segment_blocks.clear();
    }

    float extract_time;
    float gather_time;
    float radixsort_time;
    float segment_time;
    float compact_time;
    float inverse_time;

private:
    thrust::device_vector<uint32> d_inv_keys;
    thrust::device_vector<uint32> d_sort_keys;
    thrust::device_vector<uint32> d_sort_indices;
    thrust::device_vector<uint32> d_active_slots;
    thrust::device_vector<uint8>  d_segment_flags;
    thrust::device_vector<uint8>  d_partial_flags;
    thrust::device_vector<uint8>  d_temp_flags;
    thrust::device_vector<uint32> d_segment_keys;
    thrust::device_vector<uint32> d_segment_heads;
    thrust::device_vector<uint32> d_segment_blocks;
    mgpu::ContextPtr              m_mgpu;
};

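
// Example usage (an illustrative sketch, not part of the original interface): sort
// all the suffixes of a device string into a device vector of suffix indices; any
// string type modelling stream_traits (e.g. a packed device string) will do.
template <typename string_type>
void sort_all_suffixes(
    const typename stream_traits<string_type>::index_type string_len,
    const string_type                                      string,
    thrust::device_vector<uint32>&                         suffixes)
{
    suffixes.resize( string_len );

    PrefixDoublingSufSort sufsort;
    sufsort.sort( string_len, string, suffixes.begin() );
}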
/// if defined, the prefix-doubling kernel processes four suffixes per thread
#define VECTORIZED_PREFIX_DOUBLING
#if defined(VECTORIZED_PREFIX_DOUBLING)
#define PREFIX_DOUBLING_VECTOR 4
#else
#define PREFIX_DOUBLING_VECTOR 1
#endif

/// a kernel to perform a single prefix-doubling round: out_keys[i] = invSA[ SA[i] + j ]
///
__global__ void prefix_doubling_kernel(
    const uint32  n_slots,
    const uint32  n_suffixes,
    const uint32  j,
    const uint32* suffixes,
    const uint32* inv_keys,
          uint32* out_keys)
{
    //
    // perform prefix-doubling:
    //
    //   - suffixes[i] encodes the suffix at index i;
    //
    //   - in order to perform prefix-doubling, we want to set K[i] as
    //     the position of suffix SA[i] + j, i.e. K[i] = invSA[ SA[i] + j ]
    //
    const cuda::ldg_pointer<uint32> inv_keys_ldg( inv_keys );

  #if !defined(VECTORIZED_PREFIX_DOUBLING) // reference scalar implementation
    const uint32 idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= n_slots)
        return;

    const uint32 suf_nj = suffixes[idx] + j;
    const uint32 K_nj   = suf_nj < n_suffixes ? inv_keys_ldg[ suf_nj ] : 0u;
    out_keys[idx] = K_nj;
  #else // vectorized implementation
    const uint32 idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx*4u >= n_slots)
        return;

    uint4 suf_nj = ((const uint4*)suffixes)[idx];
    suf_nj = make_uint4( suf_nj.x + j, suf_nj.y + j, suf_nj.z + j, suf_nj.w + j );

    const uint4 K_nj = make_uint4(
        suf_nj.x < n_suffixes ? inv_keys_ldg[ suf_nj.x ] : 0u,
        suf_nj.y < n_suffixes ? inv_keys_ldg[ suf_nj.y ] : 0u,
        suf_nj.z < n_suffixes ? inv_keys_ldg[ suf_nj.z ] : 0u,
        suf_nj.w < n_suffixes ? inv_keys_ldg[ suf_nj.w ] : 0u );

    ((uint4*)out_keys)[idx] = K_nj;
  #endif
}

/// host function to perform a single prefix-doubling round
///
inline void prefix_doubling(
    const uint32  n_slots,
    const uint32  n_suffixes,
    const uint32  j,
    const uint32* suffixes,
    const uint32* inv_keys,
          uint32* out_keys)
{
    const uint32 blockdim = 256;
    const uint32 n_blocks = ((n_slots+PREFIX_DOUBLING_VECTOR-1)/PREFIX_DOUBLING_VECTOR + blockdim-1) / blockdim;

    prefix_doubling_kernel<<<n_blocks,blockdim>>>(
        n_slots,
        n_suffixes,
        j,
        suffixes,
        inv_keys,
        out_keys );
}
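
// A serial reference for the round above (an illustrative sketch, added for clarity):
// each active slot i reads SA[i], advances it by j symbols, and looks up the current
// rank of the resulting suffix, or 0 if it falls off the end of the string.
inline void prefix_doubling_host(
    const uint32  n_slots,
    const uint32  n_suffixes,
    const uint32  j,
    const uint32* suffixes,   // SA[.], restricted to the active slots
    const uint32* inv_keys,   // invSA[.], defined over all suffix positions
          uint32* out_keys)   // output keys K[i] = invSA[ SA[i] + j ]
{
    for (uint32 i = 0; i < n_slots; ++i)
    {
        const uint32 suf_nj = suffixes[i] + j;
        out_keys[i] = suf_nj < n_suffixes ? inv_keys[ suf_nj ] : 0u;
    }
}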

/// a kernel to mark the heads of equal-key segments and compute the per-CTA sums
/// of the head flags
///
template <uint32 BLOCKDIM>
__global__ void build_head_flags_kernel(
    const uint32  n_flags,
    const uint32* keys,
          uint8*  flags,
          uint32* blocks)
{
    const uint32 thread_id = (threadIdx.x + blockIdx.x * blockDim.x);
    const uint32 idx       = thread_id * 4;

    // load the previous key
    const uint32 key_p = idx && idx < n_flags ? keys[idx-1] : 0xFFFFFFFF;

    // load the next 4 keys
    const uint4 key = idx < n_flags ? ((const uint4*)keys)[thread_id] : make_uint4( 0, 0, 0, 0 );

    // fetch 4 flags in one go
    uchar4 flag = idx < n_flags ? ((const uchar4*)flags)[thread_id] : make_uchar4( 0, 0, 0, 0 );

    // merge them with the new flags
    flag = make_uchar4(
        (key.x != key_p) ? 1u : flag.x,
        (key.y != key.x) ? 1u : flag.y,
        (key.z != key.y) ? 1u : flag.z,
        (key.w != key.z) ? 1u : flag.w );

    // clamp them
    flag = make_uchar4(
        idx + 0u < n_flags ? flag.x : 0u,
        idx + 1u < n_flags ? flag.y : 0u,
        idx + 2u < n_flags ? flag.z : 0u,
        idx + 3u < n_flags ? flag.w : 0u );

    // write them out
    if (idx < n_flags)
        ((uchar4*)flags)[thread_id] = flag;

    // and compute their block sum
    const uint32 n = flag.x + flag.y + flag.z + flag.w;

    typedef cub::BlockReduce<uint32,BLOCKDIM> BlockReduce;

    __shared__ typename BlockReduce::TempStorage reduce_smem_storage;

    const uint32 block_aggregate = BlockReduce( reduce_smem_storage ).Sum( n );

    if (threadIdx.x == 0)
        blocks[blockIdx.x] = block_aggregate;
}
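
// Worked example (illustrative): with keys (1,1,2,3,3,3,4,4,4) and all-zero input
// flags, the merged head flags become (1,0,1,1,0,0,1,0,0), and a CTA covering all
// nine keys records a block sum of 4.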

/// a kernel to extract the segment keys and segment heads, given the head flags and
/// their scanned per-CTA sums
///
template <uint32 BLOCKDIM>
__global__ void extract_segments_kernel(
    const uint32  n_flags,
    const uint8*  flags,
    const uint32* blocks,
    const uint32* slots,
          uint32* keys,
          uint32* segments)
{
    const uint32 thread_id = (threadIdx.x + blockIdx.x * blockDim.x);
    const uint32 idx       = thread_id * 4;

    // fetch 4 flags in one go
    const uchar4 in_flag = idx < n_flags ? ((const uchar4*)flags)[thread_id] : make_uchar4( 0, 0, 0, 0 );

    const uchar4 flag = make_uchar4(
        idx + 0u < n_flags ? in_flag.x : 0u,
        idx + 1u < n_flags ? in_flag.y : 0u,
        idx + 2u < n_flags ? in_flag.z : 0u,
        idx + 3u < n_flags ? in_flag.w : 0u );

    // compute their sum
    const uint32 n = flag.x + flag.y + flag.z + flag.w;

    uint32 partial_scan;
    uint32 partial_aggregate;

    typedef cub::BlockScan<uint32,BLOCKDIM> BlockScan;

    __shared__ typename BlockScan::TempStorage scan_smem_storage;

    BlockScan( scan_smem_storage ).ExclusiveSum( n, partial_scan, partial_aggregate );

    // add the block partial
    partial_scan += blocks[ blockIdx.x ];

    // perform the serial scan to get the segment keys
    const uint4 key = make_uint4(
        partial_scan + uint32(flag.x),
        partial_scan + uint32(flag.x + flag.y),
        partial_scan + uint32(flag.x + flag.y + flag.z),
        partial_scan + uint32(flag.x + flag.y + flag.z + flag.w) );

    // write the output keys
    if (idx < n_flags)
        ((uint4*)keys)[thread_id] = key;

    if (n)
    {
        // write the output segment heads
        const uint4 slot = ((const uint4*)slots)[thread_id];

        if (flag.x) segments[key.x] = slot.x + 1u;
        if (flag.y) segments[key.y] = slot.y + 1u;
        if (flag.z) segments[key.z] = slot.z + 1u;
        if (flag.w) segments[key.w] = slot.w + 1u;
    }
}

/// given a device-side array of sorting keys, update the segment head flags and
/// extract the segments of equal keys: for each element, keys[] receives the 1-based
/// index of the segment it belongs to, and segments[] receives the (slot+1) of each
/// segment head; the function returns the total number of segments found
///
inline uint32 extract_segments(
    const uint32  n_flags,
    const uint32* in_keys,
          uint8*  flags,
          uint32* blocks,
    const uint32* slots,
          uint32* keys,
          uint32* segments)
{
    const uint32 blockdim = 128;
    const uint32 n_blocks = ((n_flags+3)/4 + blockdim-1) / blockdim;

    // update the head flags and compute their block sums
    build_head_flags_kernel<blockdim> <<<n_blocks,blockdim>>>(
        n_flags,
        in_keys,
        flags,
        blocks );

    *thrust::device_ptr<uint8>( flags + n_flags ) = 1u; // add a sentinel

    // scan the block partials
    *thrust::device_ptr<uint32>( blocks + n_blocks ) = 0u;
    thrust::exclusive_scan(
        thrust::device_ptr<uint32>( blocks ),
        thrust::device_ptr<uint32>( blocks ) + n_blocks + 1u,
        thrust::device_ptr<uint32>( blocks ),
        0u,
        thrust::plus<uint32>() );

    // and extract the new keys and segments
    extract_segments_kernel<blockdim> <<<n_blocks,blockdim>>>(
        n_flags,
        flags,
        blocks,
        slots,
        keys,
        segments );

    // after the scan, the sentinel entry holds the total number of segment heads
    return *thrust::device_ptr<uint32>( blocks + n_blocks );
}
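
// A serial reference for extract_segments (an illustrative sketch, added for clarity):
// a new segment starts wherever the key differs from its predecessor's, or where a
// head flag was already set; each element receives the 1-based index of its segment,
// and each segment head records its slot + 1.
inline uint32 extract_segments_host(
    const uint32  n_flags,
    const uint32* in_keys,
          uint8*  flags,
    const uint32* slots,
          uint32* keys,
          uint32* segments)
{
    uint32 n_segments = 0;

    for (uint32 i = 0; i < n_flags; ++i)
    {
        // merge the old head flag with the newly computed one
        if (i == 0 || in_keys[i] != in_keys[i-1])
            flags[i] = 1u;

        if (flags[i])
        {
            ++n_segments;
            segments[ n_segments ] = slots[i] + 1u;   // segment heads are stored 1-based
        }

        // assign the 1-based index of the owning segment to this element
        keys[i] = n_segments;
    }
    return n_segments;
}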

/// a kernel to compact the flagged entries of the (flags, slots, indices) arrays
/// to the positions specified by their keys
///
__global__ void compact_kernel(
    const uint32  n,
    const uint8*  stencil,
    const uint32* keys,
    const uint8*  flags,
    const uint32* slots,
    const uint32* indices,
          uint8*  out_flags,
          uint32* out_slots,
          uint32* out_indices)
{
    const uint32 thread_id = (threadIdx.x + blockIdx.x * blockDim.x);
    const uint32 idx       = 4 * thread_id;
    if (idx >= n)
        return;

    const uchar4 s = ((const uchar4*)stencil)[thread_id];

    if (s.x + s.y + s.z + s.w == 0)
        return;

    const uint4  key   = ((const uint4*)keys)[thread_id];
    const uchar4 flag  = ((const uchar4*)flags)[thread_id];
    const uint4  slot  = ((const uint4*)slots)[thread_id];
    const uint4  index = ((const uint4*)indices)[thread_id];

    if (s.x)
    {
        out_flags[ key.x - 1 ]   = flag.x;
        out_slots[ key.x - 1 ]   = slot.x;
        out_indices[ key.x - 1 ] = index.x;
    }
    if (s.y)
    {
        out_flags[ key.y - 1 ]   = flag.y;
        out_slots[ key.y - 1 ]   = slot.y;
        out_indices[ key.y - 1 ] = index.y;
    }
    if (s.z)
    {
        out_flags[ key.z - 1 ]   = flag.z;
        out_slots[ key.z - 1 ]   = slot.z;
        out_indices[ key.z - 1 ] = index.z;
    }
    if (s.w)
    {
        out_flags[ key.w - 1 ]   = flag.w;
        out_slots[ key.w - 1 ]   = slot.w;
        out_indices[ key.w - 1 ] = index.w;
    }
}

/// host function to compact the flagged entries of the (flags, slots, indices) arrays
///
inline void compact(
    const uint32  n,
    const uint8*  stencil,
    const uint32* keys,
    const uint8*  flags,
    const uint32* slots,
    const uint32* indices,
          uint8*  out_flags,
          uint32* out_slots,
          uint32* out_indices)
{
    const uint32 blockdim = 128;
    const uint32 n_words  = (n + 3) / 4;
    const uint32 n_blocks = (n_words + blockdim-1) / blockdim;

    compact_kernel<<<n_blocks,blockdim>>>(
        n,
        stencil,
        keys,
        flags,
        slots,
        indices,
        out_flags,
        out_slots,
        out_indices );
}
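
// A serial reference for compact (an illustrative sketch, added for clarity): every
// entry whose stencil is set survives, landing at position key-1 of the compacted
// output arrays, the keys being a 1-based inclusive scan of the stencil.
inline void compact_host(
    const uint32  n,
    const uint8*  stencil,
    const uint32* keys,
    const uint8*  flags,
    const uint32* slots,
    const uint32* indices,
          uint8*  out_flags,
          uint32* out_slots,
          uint32* out_indices)
{
    for (uint32 i = 0; i < n; ++i)
    {
        if (stencil[i])
        {
            out_flags  [ keys[i] - 1 ] = flags[i];
            out_slots  [ keys[i] - 1 ] = slots[i];
            out_indices[ keys[i] - 1 ] = indices[i];
        }
    }
}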
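
/// a binary functor for thrust::adjacent_difference, marking the elements that must
/// remain in the active set: an element is dropped only when both its own segment
/// flag and its successor's are set, i.e. when it forms a singleton segment.
/// NOTE: this helper is a reconstruction; the name of the predicate used by the
/// original source is assumed, but the logic matches the comments in the sorting
/// loop below.
struct keep_active_functor
{
    // adjacent_difference passes (x_i, x_{i-1}); the value written at position i
    // tells whether element i-1 has to be kept active
    NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
    uint8 operator() (const uint8 curr, const uint8 prev) const { return (curr && prev) ? 0u : 1u; }
};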

/// sort all the suffixes of a given string
///
template <typename string_type, typename output_iterator>
void PrefixDoublingSufSort::sort(
    const typename stream_traits<string_type>::index_type string_len,
    const string_type                                      string,
    output_iterator                                        d_suffixes)
{
    typedef typename stream_traits<string_type>::index_type index_type;
    const uint32 SYMBOL_SIZE = stream_traits<string_type>::SYMBOL_SIZE;

    typedef uint32 word_type;
    NVBIO_VAR_UNUSED const uint32 WORD_BITS = uint32( 8u * sizeof(word_type) );
    NVBIO_VAR_UNUSED const uint32 DOLLAR_BITS =
        SYMBOL_SIZE*4 <= WORD_BITS-4 ? 4 :      // more than three symbols per word
        SYMBOL_SIZE*3 <= WORD_BITS-2 ? 2 :      // up to three symbols per word
        SYMBOL_SIZE*2 <= WORD_BITS-2 ? 2 :      // up to two symbols per word
                                       0;       // only one symbol per word

    // e.g. for a 2-bit DNA alphabet, WORD_BITS = 32 and DOLLAR_BITS = 4
    const uint32 SYMBOLS_PER_WORD = priv::symbols_per_word<SYMBOL_SIZE,WORD_BITS,DOLLAR_BITS>();

    const uint32 n_suffixes = string_len;

    // reserve temporary storage
    reserve( n_suffixes );

    // initialize the segment flags
    thrust::fill(
        d_segment_flags.begin(),
        d_segment_flags.begin() + n_suffixes,
        uint8(0u) );

    // initialize the device sorting indices
    thrust::copy(
        thrust::make_counting_iterator<uint32>(0u),
        thrust::make_counting_iterator<uint32>(0u) + n_suffixes,
        d_sort_indices.begin() );

    // initialize the active slots
    thrust::copy(
        thrust::make_counting_iterator<uint32>(0u),
        thrust::make_counting_iterator<uint32>(0u) + n_suffixes,
        d_active_slots.begin() );

    // first pass: sort all suffixes by the first radix word
    NVBIO_CUDA_DEBUG_STATEMENT( fprintf(stderr,"  primary key sorting\n") );
    {
        Timer timer;
        timer.start();

        // extract the first radix word from each of the suffixes
        // (the word extractor below is assumed to be the one from sufsort_priv.h)
        priv::string_suffix_word_functor<SYMBOL_SIZE,WORD_BITS,DOLLAR_BITS,string_type,word_type> word_functor( string_len, string, 0u );

        thrust::copy(
            thrust::make_transform_iterator( d_sort_indices.begin(), word_functor ),
            thrust::make_transform_iterator( d_sort_indices.begin() + n_suffixes, word_functor ),
            d_sort_keys.begin() );

        NVBIO_CUDA_DEBUG_STATEMENT( cudaDeviceSynchronize() );
        timer.stop();
        extract_time += timer.seconds();

        timer.start();

        cuda::SortBuffers<uint32*,uint32*> sort_buffers;
        cuda::SortEnactor                  sort_enactor;

        sort_buffers.selector  = 0;
        sort_buffers.keys[0]   = nvbio::raw_pointer( d_sort_keys );
        sort_buffers.keys[1]   = nvbio::raw_pointer( d_segment_keys );
        sort_buffers.values[0] = nvbio::raw_pointer( d_sort_indices );
        sort_buffers.values[1] = nvbio::raw_pointer( d_segment_heads );

        // sort the keys together with the indices
        sort_enactor.sort( n_suffixes, sort_buffers );

        if (sort_buffers.selector)
        {
            // swap the buffers
            d_sort_keys.swap( d_segment_keys );
            d_sort_indices.swap( d_segment_heads );
        }

        NVBIO_CUDA_DEBUG_STATEMENT( cudaDeviceSynchronize() );
        timer.stop();
        radixsort_time += timer.seconds();
    }

    NVBIO_CUDA_DEBUG_STATEMENT( fprintf(stderr,"  primary key compression\n") );
    {
        Timer timer;
        timer.start();

        // find out consecutive items with equal keys
        // and take the minimum of their positions over the newly defined key sets, i.e:
        //   pos  :  1  2  3  4  5  6  7  8  9
        //   keys : (1, 1, 2, 3, 3, 3, 4, 4, 4) ->
        //   out  : (1, 1, 3, 4, 4, 4, 7, 7, 7)
        NVBIO_VAR_UNUSED const uint32 n_segments = extract_segments(
            n_suffixes,
            nvbio::raw_pointer( d_sort_keys ),
            nvbio::raw_pointer( d_segment_flags ),
            nvbio::raw_pointer( d_segment_blocks ),
            nvbio::raw_pointer( d_active_slots ),
            nvbio::raw_pointer( d_segment_keys ),
            nvbio::raw_pointer( d_segment_heads ) );

        NVBIO_CUDA_DEBUG_STATEMENT( cudaDeviceSynchronize() );
        timer.stop();
        segment_time += timer.seconds();

        timer.start();

        // now scatter the new keys to obtain the inverse keys
        thrust::scatter(
            thrust::make_permutation_iterator( d_segment_heads.begin(), d_segment_keys.begin() ),
            thrust::make_permutation_iterator( d_segment_heads.begin(), d_segment_keys.begin() ) + n_suffixes,
            d_sort_indices.begin(),
            d_inv_keys.begin() );

        NVBIO_CUDA_DEBUG_STATEMENT( cudaDeviceSynchronize() );
        timer.stop();
        inverse_time += timer.seconds();
    }

    // keep track of the number of active suffixes
    uint32 n_active_suffixes = n_suffixes;

    for (uint32 j = SYMBOLS_PER_WORD; j < string_len; j *= 2u)
    {
      #define CSS_KEY_PRUNING
      #if defined(CSS_KEY_PRUNING)
        //
        // extract all the strings from segments with more than 1 element: in order to do
        // this we want to mark all the elements that need to be copied. These are exactly
        // all the 1's followed by a 0, and all the 0's. Conversely, all the 1's followed
        // by a 1 don't need to be copied.
        // i.e. we want to transform the flags:
        //   (1,1,0,0,0,1,1,1,0,1) into the vector:
        //   (0,1,1,1,1,0,0,1,1,0)
        // This can be done again looking at neighbours, this time with a custom
        // binary-predicate implementing the above logic.
        // As thrust::adjacent_difference is right-aligned, we'll obtain the following
        // situation instead:
        //   segment-flags: (1,1,0,0,0,1,1,1,0,1,[1])   (where [1] is a manually inserted sentinel)
        //   output:        (1,0,1,1,1,1,0,0,1,1,0)
        // where we have added a sentinel value to the first vector, and will find our
        // results starting at position one in the output
        //
        thrust::adjacent_difference(
            d_segment_flags.begin(),
            d_segment_flags.begin() + n_active_suffixes+1u,
            d_partial_flags.begin() + 3u,
            keep_active_functor() );

        // and scan them to get their candidate output positions, i.e:
        //   partial-flags : (0,1,1,1,1,0,0,1,1,0) ->
        //   positions     : (0,1,2,3,4,4,4,5,6,6)
        thrust::inclusive_scan(
            d_partial_flags.begin() + 4u,
            d_partial_flags.begin() + 4u + n_active_suffixes,
            d_segment_keys.begin(),
            thrust::plus<uint32>() );

        const uint32 n_partials = d_segment_keys[ n_active_suffixes-1u ];

        //NVBIO_CUDA_DEBUG_STATEMENT( fprintf(stderr,"    partial: %.3f%%\n", 100.0f*float(n_partials)/float(n_suffixes) ) );

        // check if the number of "unique" keys is small enough to justify reducing the active set
        //if (2u*n_segments >= n_active_suffixes)
        if (3u*n_partials <= n_active_suffixes*2u)
        {
            //NVBIO_CUDA_DEBUG_STATEMENT( fprintf(stderr,"\n    segments: %.3f%%, at pass %u\n", 100.0f*float(n_segments)/float(n_suffixes), word_idx) );

            Timer timer;
            timer.start();

            // before shrinking the set of active suffixes, scatter the already sorted
            // indices to the output in their proper place
            thrust::scatter_if(
                d_sort_indices.begin(),                                                                      // input begin
                d_sort_indices.begin() + n_active_suffixes,                                                  // input end
                d_active_slots.begin(),                                                                      // map
                thrust::make_transform_iterator( d_partial_flags.begin() + 4u, is_false_functor<uint32>() ), // stencil
                d_suffixes );                                                                                // output

            // now keep only the slots we are interested in
            compact(
                n_active_suffixes,
                nvbio::raw_pointer( d_partial_flags ) + 4u,
                nvbio::raw_pointer( d_segment_keys ),
                nvbio::raw_pointer( d_segment_flags ),
                nvbio::raw_pointer( d_active_slots ),
                nvbio::raw_pointer( d_sort_indices ),
                nvbio::raw_pointer( d_temp_flags ),
                nvbio::raw_pointer( d_sort_keys ),
                nvbio::raw_pointer( d_segment_heads ) );

            // and swap the buffers
            d_segment_flags.swap( d_temp_flags );
            d_active_slots.swap( d_sort_keys );
            d_sort_indices.swap( d_segment_heads );

            // overwrite the number of active suffixes
            n_active_suffixes = n_partials;

            NVBIO_CUDA_DEBUG_STATEMENT( cudaDeviceSynchronize() );
            timer.stop();
            compact_time += timer.seconds();

            NVBIO_CUDA_DEBUG_STATEMENT( fprintf(stderr,"    active: %.3f%%\n", 100.0f*float(n_active_suffixes)/float(n_suffixes) ) );
        }
      #endif // CSS_KEY_PRUNING

        NVBIO_CUDA_DEBUG_STATEMENT( fprintf(stderr,"  secondary key sorting [%u]\n", j) );
        Timer timer;
        timer.start();

        // build the new sorting keys
        //
        //   - d_sort_keys encodes an array K*, such that K*[i] = K[SA[i]] is the key
        //     currently bound to suffix i;
        //
        //   - in order to perform prefix-doubling, we want to replace K[i] with K[i+j],
        //     but we have to modify K* instead, because that's all we have got;
        //
        //   - so we have to replace K*[n] with:
        //       K[SA[n]+j] = K*[invSA[SA[n]+j]]
        //
        prefix_doubling(
            n_active_suffixes,
            n_suffixes,
            j,
            nvbio::raw_pointer( d_sort_indices ),
            nvbio::raw_pointer( d_inv_keys ),
            nvbio::raw_pointer( d_sort_keys ) );

        NVBIO_CUDA_DEBUG_STATEMENT( cudaDeviceSynchronize() );
        timer.stop();
        gather_time += timer.seconds();

        timer.start();

        uint32* d_comp_flags = (uint32*)nvbio::raw_pointer( d_temp_flags );

        // build the compressed flags, packing one segment flag per bit
        priv::pack_flags(
            n_active_suffixes,
            nvbio::raw_pointer( d_segment_flags ),
            d_comp_flags );

        // sort within segments
        mgpu::SegSortPairsFromFlags(
            nvbio::raw_pointer( d_sort_keys ),
            nvbio::raw_pointer( d_sort_indices ),
            n_active_suffixes,
            d_comp_flags,
            *m_mgpu );

        NVBIO_CUDA_DEBUG_STATEMENT( cudaDeviceSynchronize() );
        timer.stop();
        radixsort_time += timer.seconds();

        timer.start();

        // find out consecutive items with equal keys, merging them with the parent's,
        // and take the minimum of their positions over the newly defined key sets, i.e:
        //   pos  :  1  2  3  4  5  6  7  8  9
        //   keys : (1, 1, 2, 3, 3, 3, 4, 4, 4) ->
        //   out  : (1, 1, 3, 4, 4, 4, 7, 7, 7)
        const uint32 n_segments = extract_segments(
            n_active_suffixes,
            nvbio::raw_pointer( d_sort_keys ),
            nvbio::raw_pointer( d_segment_flags ),
            nvbio::raw_pointer( d_segment_blocks ),
            nvbio::raw_pointer( d_active_slots ),
            nvbio::raw_pointer( d_segment_keys ),
            nvbio::raw_pointer( d_segment_heads ) );

        if (n_segments == n_active_suffixes) // we are done!
        {
            // scatter the sorted indices to the output in their proper place
            thrust::scatter(
                d_sort_indices.begin(),
                d_sort_indices.begin() + n_active_suffixes,
                d_active_slots.begin(),
                d_suffixes );

            break; // bail out of the sorting loop
        }

        NVBIO_CUDA_DEBUG_STATEMENT( cudaDeviceSynchronize() );
        timer.stop();
        segment_time += timer.seconds();

        timer.start();

        // now scatter the new keys to obtain the inverse keys
        thrust::scatter(
            thrust::make_permutation_iterator( d_segment_heads.begin(), d_segment_keys.begin() ),
            thrust::make_permutation_iterator( d_segment_heads.begin(), d_segment_keys.begin() ) + n_active_suffixes,
            d_sort_indices.begin(),
            d_inv_keys.begin() );

        NVBIO_CUDA_DEBUG_STATEMENT( cudaDeviceSynchronize() );
        timer.stop();
        inverse_time += timer.seconds();
    }
}

} // namespace cuda
} // namespace nvbio