compression_sort.h
/*
 * nvbio
 * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *    * Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *    * Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *    * Neither the name of the NVIDIA CORPORATION nor the
 *      names of its contributors may be used to endorse or promote products
 *      derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#pragma once

#include <nvbio/sufsort/sufsort_priv.h>
#include <nvbio/basic/timer.h>
#include <nvbio/basic/cuda/primitives.h>
#include <nvbio/basic/cuda/sort.h>
#include <thrust/device_vector.h>
#include <thrust/transform_scan.h>
#include <thrust/binary_search.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/sort.h>
#include <mgpuhost.cuh>
#include <moderngpu.cuh>

namespace nvbio {
namespace cuda {

///
/// A delay list, collecting the indices of strings whose sorting has been postponed,
/// together with the output slots where their sorted positions will eventually have
/// to be written.
///
template <typename OutputIterator>
struct DelayList
{
    /// constructor
    ///
    DelayList(
        OutputIterator _indices,
        OutputIterator _slots) :
        count( 0 ),
        offset( 0 ),
        indices( _indices ),
        slots( _slots ) {}

    /// clear the list
    ///
    void clear() { count = 0; }

    /// set the offset added to all pushed slots
    ///
    void set_offset(const uint32 _offset) { offset = _offset; }

    /// push back a set of delayed entries
    ///
    template <typename InputIterator>
    void push_back(
        const uint32  in_count,
        InputIterator in_indices,
        InputIterator in_slots)
    {
        // append the delayed string indices to the list
        thrust::copy(
            in_indices,
            in_indices + in_count,
            indices + count );

        // append the corresponding output slots, shifted by the list's offset
        thrust::transform(
            in_slots,
            in_slots + in_count,
            slots + count,
            priv::offset_functor( offset ) );

        count += in_count;
    }

    uint32         count;
    uint32         offset;
    OutputIterator indices;
    OutputIterator slots;
};
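
//
// A minimal usage sketch (illustrative, not part of the original header): a DelayList writing
// into a pair of preallocated device vectors; the names d_delayed_indices and d_delayed_slots
// are hypothetical.
//
// \code
// thrust::device_vector<uint32> d_delayed_indices( n );
// thrust::device_vector<uint32> d_delayed_slots( n );
//
// DelayList< thrust::device_vector<uint32>::iterator > delay_list(
//     d_delayed_indices.begin(),
//     d_delayed_slots.begin() );
//
// delay_list.set_offset( batch_begin );  // make the pushed slots global output positions
// \endcode
//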

///
/// A delay list which simply discards anything pushed to it.
///
struct DiscardDelayList
{
    /// push back a set of delayed entries (no-op)
    ///
    template <typename InputIterator>
    void push_back(
        const uint32  n_delayed,
        InputIterator delay_indices,
        InputIterator delay_slots)
    {}
};

///
/// A sorting context implementing Iterative Compression Sorting - an algorithm inspired by
/// Tero Karras and Timo Aila's "Flexible Parallel Sorting through Iterative Key Compression" -
/// used to sort batches of suffixes or strings directly on the device.
///
struct CompressionSort
{
    /// constructor
    ///
    /// \param _mgpu    ModernGPU context pointer
    ///
    CompressionSort(mgpu::ContextPtr _mgpu) :
        extract_time(0.0f),
        radixsort_time(0.0f),
        stablesort_time(0.0f),
        compress_time(0.0f),
        compact_time(0.0f),
        copy_time(0.0f),
        scatter_time(0.0f),
        m_mgpu( _mgpu ) {}

    /// Sort a batch of suffixes of a given device string using iterative key compression.
    /// Suffixes which are not fully sorted within the given delay thresholds are pushed
    /// to the delay list instead.
    ///
    /// \param string_len           string length
    /// \param string               string iterator
    /// \param n_suffixes           number of suffixes to sort
    /// \param d_suffixes           device vector of input/output suffixes
    /// \param delay_min_threshold  minimum number of sorting passes before nearly-converged
    ///                             batches may be delayed
    /// \param delay_max_threshold  maximum number of sorting passes, after which all still
    ///                             active suffixes are delayed
    /// \param delay_list           the output delay list
    ///
    template <typename string_type, typename output_iterator, typename delay_list_type>
    void sort(
        const typename string_type::index_type  string_len,
        const string_type                       string,
        const uint32                            n_suffixes,
        output_iterator                         d_suffixes,
        const uint32                            delay_min_threshold,
        const uint32                            delay_max_threshold,
        delay_list_type&                        delay_list);

    /// Sort a batch of strings from a string set using iterative key compression,
    /// processing the radix words in slices of slice_size words at a time.
    ///
    /// \param set              the set of items to sort
    /// \param n_strings        the number of items to sort
    /// \param max_words        the maximum number of radix words to sort by
    /// \param d_input          device vector of input indices
    /// \param d_output         device vector of output indices
    /// \param delay_threshold  number of sorting passes before nearly-converged batches
    ///                         may be delayed
    /// \param delay_list       the output delay list
    /// \param slice_size       number of radix words extracted per slice
    ///
    template <typename set_type, typename input_iterator, typename output_iterator, typename delay_list_type>
    void sort(
        set_type         set,
        const uint32     n_strings,
        const uint32     max_words,
        input_iterator   d_input,
        output_iterator  d_output,
        const uint32     delay_threshold,
        delay_list_type& delay_list,
        const uint32     slice_size = 8u);
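
    // For example (an illustrative note, not from the original header): with max_words = 32
    // and the default slice_size = 8, the implementation below processes the radix words in
    // 4 slices of 8 words each, extracting each slice from the string set with a single
    // init_slice() call before sorting by its words one at a time.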

    /// reserve enough device memory to sort n items
    ///
    void reserve(const uint32 n)
    {
        try
        {
            priv::alloc_storage( d_temp_indices,  n+4 );
            priv::alloc_storage( d_indices,       n+4 );
            priv::alloc_storage( d_keys,          n+4 );
            priv::alloc_storage( d_active_slots,  n+4 );
            priv::alloc_storage( d_segment_flags, n+32 );
            priv::alloc_storage( d_copy_flags,    n+32 );
            priv::alloc_storage( d_temp_flags,    n+32 );
        }
        catch (...)
        {
            fprintf(stderr, "CompressionSort::reserve() : exception caught!\n");
            throw;
        }
    }

    /// return the amount of device memory needed to sort n items
    ///
    static uint64 needed_device_memory(const uint32 n)
    {
        return
            (n+4)  * sizeof(uint32) +   // d_temp_indices
            (n+4)  * sizeof(uint32) +   // d_indices
            (n+4)  * sizeof(uint32) +   // d_keys
            (n+4)  * sizeof(uint32) +   // d_active_slots
            (n+32) * sizeof(uint8) +    // d_segment_flags
            (n+32) * sizeof(uint8) +    // d_copy_flags
            (n+32) * sizeof(uint8);     // d_temp_flags
    }
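
    // For instance (illustrative arithmetic only): for n = 16M items this amounts to
    // 4*(n+4)*4 bytes of indices/keys/slots plus 3*(n+32) bytes of flags, i.e. roughly
    // 19 bytes per item, or about 304 MB of device memory, excluding d_temp_storage.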

    /// return the amount of device memory currently allocated
    ///
    uint64 allocated_device_memory() const
    {
        return
            d_temp_storage.size()  * sizeof(uint8) +
            d_temp_indices.size()  * sizeof(uint32) +
            d_indices.size()       * sizeof(uint32) +
            d_keys.size()          * sizeof(uint32) +
            d_active_slots.size()  * sizeof(uint32) +
            d_segment_flags.size() * sizeof(uint8) +
            d_copy_flags.size()    * sizeof(uint8) +
            d_temp_flags.size()    * sizeof(uint8);
    }

    float extract_time;
    float radixsort_time;
    float stablesort_time;
    float compress_time;
    float compact_time;
    float copy_time;
    float scatter_time;

private:
    thrust::device_vector<uint8>  d_temp_storage;
    thrust::device_vector<uint32> d_temp_indices;
    thrust::device_vector<uint32> d_indices;
    thrust::device_vector<uint32> d_keys;
    thrust::device_vector<uint32> d_active_slots;
    thrust::device_vector<uint8>  d_segment_flags;
    thrust::device_vector<uint8>  d_copy_flags;
    thrust::device_vector<uint8>  d_temp_flags;
    mgpu::ContextPtr              m_mgpu;
};
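
//
// A minimal usage sketch (illustrative, not part of the original header): sorting the suffixes
// of a device string while discarding any delayed entries; string_len, string, n_suffixes and
// d_suffixes are assumed to come from the surrounding sufsort machinery, and the delay
// thresholds are illustrative values.
//
// \code
// mgpu::ContextPtr context = mgpu::CreateCudaDevice( 0 );
//
// cuda::CompressionSort sorter( context );
// sorter.reserve( n_suffixes );
//
// cuda::DiscardDelayList discard;
// sorter.sort(
//     string_len, string,              // the string whose suffixes are being sorted
//     n_suffixes, d_suffixes.begin(),  // input/output suffix indices
//     16u, 24u,                        // delay_min / delay_max thresholds
//     discard );
// \endcode
//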

// Sort a given batch of suffixes using Iterative Compression Sorting - an algorithm inspired by
// Tero Karras and Timo Aila's "Flexible Parallel Sorting through Iterative Key Compression".
//
// \param string_len   string length
// \param string       string iterator
// \param n_suffixes   number of suffixes to sort
// \param d_suffixes   device vector of input/output suffixes
//
template <typename string_type, typename output_iterator, typename delay_list_type>
void CompressionSort::sort(
    const typename string_type::index_type  string_len,
    const string_type                       string,
    const uint32                            n_suffixes,
    output_iterator                         d_suffixes,
    const uint32                            delay_min_threshold,
    const uint32                            delay_max_threshold,
    delay_list_type&                        delay_list)
{
    typedef typename string_type::index_type index_type;
    NVBIO_VAR_UNUSED const uint32 SYMBOL_SIZE = string_type::SYMBOL_SIZE;

    typedef uint32 word_type;
    NVBIO_VAR_UNUSED const uint32 WORD_BITS   = uint32( 8u * sizeof(uint32) );
    NVBIO_VAR_UNUSED const uint32 DOLLAR_BITS = 4;

    try
    {
        NVBIO_CUDA_DEBUG_STATEMENT( log_debug( stderr, "  initialize\n" ) );

        // reserve temporary storage
        reserve( n_suffixes );

        // initialize the device sorting indices
        thrust::copy(
            d_suffixes,
            d_suffixes + n_suffixes,
            d_indices.begin() );

        // initialize the device sorting keys
        thrust::copy(
            thrust::make_constant_iterator<uint32>(0u),
            thrust::make_constant_iterator<uint32>(0u) + n_suffixes,
            d_keys.begin() );

        // initialize the active slots
        thrust::copy(
            thrust::make_counting_iterator<uint32>(0u),
            thrust::make_counting_iterator<uint32>(0u) + n_suffixes,
            d_active_slots.begin() );

        // initialize the segment flags
        d_segment_flags[0] = 1u;
        thrust::fill(
            d_segment_flags.begin() + 1u,
            d_segment_flags.begin() + n_suffixes,
            uint8(0) );

        // keep track of the number of active suffixes
        uint32 n_active_suffixes = n_suffixes;

        //
        // Do what is essentially an MSB radix-sort on the suffixes, word by word, using iterative
        // key compression: the idea is that at each step we sort the current, say, 64-bit keys,
        // and then "rewrite" them so as to reduce their entropy to the minimum (e.g. the vector
        // (131, 542, 542, 7184, 8192, 8192) becomes (0, 1, 1, 2, 3, 3)).
        // At that point, we fetch a new 32-bit radix from the strings and append it to each key,
        // shifting the old value to the high 32 bits and merging the new radix into the lowest 32.
        // We repeat until all keys have a unique value.
        // This algorithm is a derivative of Tero Karras and Timo Aila's "Flexible Parallel
        // Sorting through Iterative Key Compression".
        //
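        //
        // For example (an illustrative trace, not code from the original): a single round
        // starting from freshly extracted radix words proceeds as
        //
        //   sorted keys:     (131, 542, 542, 7184, 8192, 8192)
        //   compressed keys: (  0,   1,   1,    2,    3,    3)    <- entropy-reduced ranks
        //   next radix:      (  7,   3,   9,    1,    4,    2)    <- fetched per suffix
        //   merged keys:     (0<<32|7, 1<<32|3, 1<<32|9, ... )    <- conceptually 64-bit
        //
        // In practice only the low 32 bits are stored explicitly, while the equality structure
        // of the high bits is tracked implicitly through the segment flags used below.
        //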
        for (uint32 word_idx = 0; true; ++word_idx)
        {
            if ((word_idx >= delay_min_threshold && 1000 * n_active_suffixes <= n_suffixes) ||
                (word_idx >= delay_max_threshold))
            {
                delay_list.push_back(
                    n_active_suffixes,
                    d_indices.begin(),
                    d_active_slots.begin() );

                break; // bail out of the sorting loop
            }

            Timer timer;
            timer.start();

            // extract the given radix word from each of the partially sorted suffixes and merge it with the existing keys
            const priv::string_suffix_word_functor<SYMBOL_SIZE,WORD_BITS,DOLLAR_BITS,string_type,word_type> word_functor( string_len, string, word_idx );

            NVBIO_CUDA_DEBUG_STATEMENT( log_debug( stderr, "    extract\n" ) );
            thrust::transform(
                d_indices.begin(),
                d_indices.begin() + n_active_suffixes,
                d_keys.begin(),
                word_functor );

            NVBIO_CUDA_DEBUG_STATEMENT( cudaDeviceSynchronize() );
            cuda::check_error("CompressionSort::sort() : extract");

            timer.stop();
            extract_time += timer.seconds();

            timer.start();

            // build the compressed flags
            NVBIO_CUDA_DEBUG_STATEMENT( log_debug( stderr, "    pack-flags\n" ) );
            uint32* d_comp_flags = (uint32*)nvbio::raw_pointer( d_temp_flags );
            priv::pack_flags(
                n_active_suffixes,
                nvbio::raw_pointer( d_segment_flags ),
                d_comp_flags );

            NVBIO_CUDA_DEBUG_STATEMENT( cudaDeviceSynchronize() );
            cuda::check_error("CompressionSort::sort() : pack_flags");

            // sort within segments
            NVBIO_CUDA_DEBUG_STATEMENT( log_debug( stderr, "    seg-sort\n" ) );
            mgpu::SegSortPairsFromFlags(
                nvbio::raw_pointer( d_keys ),
                nvbio::raw_pointer( d_indices ),
                n_active_suffixes,
                d_comp_flags,
                *m_mgpu );

            NVBIO_CUDA_DEBUG_STATEMENT( cudaDeviceSynchronize() );
            cuda::check_error("CompressionSort::sort() : seg_sort");

            timer.stop();
            radixsort_time += timer.seconds();

            timer.start();

            // find consecutive items with equal keys
            //
            // We can easily compute the head flags for a set of "segments" of equal keys, just
            // by comparing each key in the sorted list with its predecessor.
            // At that point, we can isolate all segments which contain more than 1 suffix and
            // continue sorting those by themselves.
            NVBIO_CUDA_DEBUG_STATEMENT( log_debug( stderr, "    build-head-flags\n" ) );
            priv::build_head_flags(
                n_active_suffixes,
                nvbio::raw_pointer( d_keys ),
                nvbio::raw_pointer( d_segment_flags ) );

            NVBIO_CUDA_DEBUG_STATEMENT( cudaDeviceSynchronize() );
            cuda::check_error("CompressionSort::sort() : build_head_flags");

            d_segment_flags[0]                 = 1u; // make sure the first flag is a 1
            d_segment_flags[n_active_suffixes] = 1u; // and add a sentinel

            // perform a scan to "compress" the keys in place, removing holes between them and
            // reducing their entropy; this operation produces a 1-based vector of contiguous
            // values of the kind (1, 1, 2, 3, 3, 3, ...)
            cuda::inclusive_scan(
                n_active_suffixes,
                thrust::make_transform_iterator( d_segment_flags.begin(), cast_functor<uint8,uint32>() ),
                d_keys.begin(),
                thrust::plus<uint32>(),
                d_temp_storage );

            NVBIO_CUDA_DEBUG_STATEMENT( cudaDeviceSynchronize() );
            timer.stop();
            compress_time += timer.seconds();

            const uint32 n_segments = d_keys[ n_active_suffixes - 1u ];
            //NVBIO_CUDA_DEBUG_STATEMENT( fprintf(stderr,"\n    segments: %u/%u, at pass %u\n", n_segments, n_active_suffixes, word_idx) );

            if (n_segments == n_active_suffixes) // we are done!
            {
                // scatter the partially sorted indices to the output in their proper place
                NVBIO_CUDA_DEBUG_STATEMENT( log_debug( stderr, "    scatter-partials\n" ) );
                thrust::scatter(
                    d_indices.begin(),
                    d_indices.begin() + n_active_suffixes,
                    d_active_slots.begin(),
                    d_suffixes );

                break; // bail out of the sorting loop
            }

            #define KEY_PRUNING
            #if defined(KEY_PRUNING)
            //
            // Extract all the strings from segments with more than 1 element: in order to do
            // this we want to mark all the elements that need to be copied. These are exactly
            // all the 1's followed by a 0, and all the 0's. Conversely, all the 1's followed
            // by a 1 don't need to be copied, i.e. we want to transform the flags:
            //   (1,1,0,0,0,1,1,1,0,1) into the vector:
            //   (0,1,1,1,1,0,0,1,1,0)
            // This can be done again by looking at neighbours, this time with a custom binary
            // predicate implementing the above logic.
            // As thrust::adjacent_difference is right-aligned, we'll obtain the following
            // situation instead:
            //   segment-flags: (1,1,0,0,0,1,1,1,0,1,[1])  (where [1] is a manually inserted sentinel)
            //   copy-flags:    (1,0,1,1,1,1,0,0,1,1,0,[.])
            // where we have added a sentinel value to the first vector, and will find our
            // results starting at position one in the output.
            //
            thrust::adjacent_difference(
                d_segment_flags.begin(),
                d_segment_flags.begin() + n_active_suffixes+1u,
                d_copy_flags.begin(),
                priv::remove_singletons() );

            const uint32 n_partials = cuda::reduce(
                n_active_suffixes,
                thrust::make_transform_iterator( d_copy_flags.begin() + 1u, cast_functor<uint8,uint32>() ),
                thrust::plus<uint32>(),
                d_temp_storage );

            // check if the number of "unique" keys is small enough to justify reducing the active set
            //if (2u*n_segments >= n_active_suffixes)
            if (2u*n_partials <= n_active_suffixes)
            {
                NVBIO_CUDA_DEBUG_STATEMENT( log_debug( stderr, "    filter-unique %u\n", n_partials ) );

                timer.start();

                // scatter the partially sorted indices to the output in their proper place
                thrust::scatter(
                    d_indices.begin(),
                    d_indices.begin() + n_active_suffixes,
                    d_active_slots.begin(),
                    d_suffixes );

                thrust::device_vector<uint32>& d_temp_indices = d_keys;

                // now keep only the indices we are interested in
                const uint32 n_active = cuda::copy_flagged(
                    n_active_suffixes,
                    d_indices.begin(),
                    d_copy_flags.begin() + 1u,
                    d_temp_indices.begin(),
                    d_temp_storage );
                if (n_active != n_partials)
                    throw nvbio::runtime_error("mismatching number of partial indices %u != %u\n", n_active, n_partials);

                d_indices.swap( d_temp_indices );

                // as well as their slots
                const uint32 n_active_slots = cuda::copy_flagged(
                    n_active_suffixes,
                    d_active_slots.begin(),
                    d_copy_flags.begin() + 1u,
                    d_temp_indices.begin(),
                    d_temp_storage );
                if (n_active_slots != n_partials)
                    throw nvbio::runtime_error("mismatching number of partial slots %u != %u\n", n_active_slots, n_partials);

                d_active_slots.swap( d_temp_indices );

                // and the segment flags
                const uint32 n_active_flags = cuda::copy_flagged(
                    n_active_suffixes,
                    d_segment_flags.begin(),
                    d_copy_flags.begin() + 1u,
                    d_temp_flags.begin(),
                    d_temp_storage );
                if (n_active_flags != n_partials)
                    throw nvbio::runtime_error("mismatching number of partial flags %u != %u\n", n_active_flags, n_partials);

                d_segment_flags.swap( d_temp_flags );

                // overwrite the number of active suffixes
                n_active_suffixes = n_partials;

                NVBIO_CUDA_DEBUG_STATEMENT( cudaDeviceSynchronize() );
                timer.stop();
                compact_time += timer.seconds();

                //NVBIO_CUDA_DEBUG_STATEMENT( fprintf(stderr,"\n    active: %.3f%% in %.3f%% segments, at pass %u\n", 100.0f*float(n_active_suffixes)/float(n_suffixes), 100.0f*float(n_segments)/float(n_suffixes), word_idx) );
            }
            #endif // if defined(KEY_PRUNING)
        }
    }
    catch (cuda_error& error)
    {
        log_error(stderr, "CompressionSort::sort() : cuda_error caught!\n  %s\n", error.what());
        throw error;
    }
    catch (...)
    {
        log_error(stderr, "CompressionSort::sort() : exception caught!\n");
        throw;
    }
}
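
//
// An illustrative, self-contained sketch (not part of the original header) of the key-compression
// step in isolation, assuming the keys have already been sorted: head flags are built by comparing
// neighbouring keys, and an inclusive scan turns them into the 1-based compressed keys described
// above.
//
// \code
// #include <thrust/adjacent_difference.h>
// #include <thrust/scan.h>
// #include <thrust/functional.h>
//
// void compress_keys_example(thrust::device_vector<uint32>& d_keys)
// {
//     thrust::device_vector<uint32> d_flags( d_keys.size() );
//
//     // flag the positions where a key differs from its predecessor
//     thrust::adjacent_difference(
//         d_keys.begin(), d_keys.end(),
//         d_flags.begin(),
//         thrust::not_equal_to<uint32>() );
//     d_flags[0] = 1u; // the first key always starts a new segment
//
//     // scan the head flags to obtain compressed, 1-based keys,
//     // e.g. (131, 542, 542, 7184) -> (1, 2, 2, 3)
//     thrust::inclusive_scan( d_flags.begin(), d_flags.end(), d_keys.begin() );
// }
// \endcode
//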

// Sort a given batch of strings using Iterative Compression Sorting - an algorithm inspired by
// Tero Karras and Timo Aila's "Flexible Parallel Sorting through Iterative Key Compression".
//
// \param set          the set of items to sort
// \param n_strings    the number of items to sort
// \param d_output     device vector of output indices
//
// All the other parameters are temporary device buffers.
//
template <typename set_type, typename input_iterator, typename output_iterator, typename delay_list_type>
void CompressionSort::sort(
    set_type         set,
    const uint32     n_strings,
    const uint32     max_words,
    input_iterator   d_input,
    output_iterator  d_output,
    const uint32     delay_threshold,
    delay_list_type& delay_list,
    const uint32     slice_size)
{
    //#define CULLED_EXTRACTION

    typedef uint32 index_type;

    try
    {
        NVBIO_CUDA_DEBUG_STATEMENT( log_debug( stderr, "  initialize\n" ) );

        // reserve temporary storage
        reserve( n_strings );

        // initialize the device sorting indices
        thrust::copy(
            d_input,
            d_input + n_strings,
            d_indices.begin() );

        // initialize the active slots
        thrust::copy(
            thrust::make_counting_iterator<uint32>(0u),
            thrust::make_counting_iterator<uint32>(0u) + n_strings,
            d_active_slots.begin() );

        // initialize the segment flags
        d_segment_flags[0] = 1u;
        thrust::fill(
            d_segment_flags.begin() + 1u,
            d_segment_flags.begin() + n_strings,
            uint8(0) );

        // keep track of the number of active strings
        uint32 n_active_strings = n_strings;

        //
        // Do what is essentially an MSB radix-sort on the strings, word by word, using iterative
        // key compression: the idea is that at each step we sort the current, say, 64-bit keys,
        // and then "rewrite" them so as to reduce their entropy to the minimum (e.g. the vector
        // (131, 542, 542, 7184, 8192, 8192) becomes (0, 1, 1, 2, 3, 3)).
        // At that point, we fetch a new 32-bit radix from the strings and append it to each key,
        // shifting the old value to the high 32 bits and merging the new radix into the lowest 32.
        // We repeat until all keys have a unique value.
        // This algorithm is a derivative of Tero Karras and Timo Aila's "Flexible Parallel
        // Sorting through Iterative Key Compression".
        //
        for (uint32 word_block_begin = 0; word_block_begin < max_words; word_block_begin += slice_size)
        {
            NVBIO_CUDA_DEBUG_STATEMENT( log_debug( stderr, "  slice %u/%u (%u active)\n", word_block_begin, max_words, n_active_strings ) );
            const uint32 word_block_end = nvbio::min( word_block_begin + slice_size, max_words );

            Timer timer;
            timer.start();

            #if defined(CULLED_EXTRACTION)
            // extract the given radix word slice from each of the partially sorted strings on the host
            set.init_slice(
                n_active_strings,
                n_active_strings == n_strings ? (const uint32*)NULL : raw_pointer( d_indices ),
                word_block_begin,
                word_block_end );
            #else
            // extract the given radix word slice from all the strings on the host
            set.init_slice(
                n_strings,
                NULL,
                word_block_begin,
                word_block_end );
            #endif

            NVBIO_CUDA_DEBUG_STATEMENT( cudaDeviceSynchronize() );
            timer.stop();
            extract_time += timer.seconds();

            // do what is essentially an MSB sort on the strings
            for (uint32 word_idx = word_block_begin; word_idx < word_block_end; ++word_idx)
            {
                if (word_idx > delay_threshold && 1000 * n_active_strings <= n_strings)
                {
                    delay_list.push_back(
                        n_active_strings,
                        d_indices.begin(),
                        d_active_slots.begin() );

                    return; // bail out of the sorting loop
                }

                #if defined(CULLED_EXTRACTION)
                timer.start();

                // extract only the active radices, already in sorted order
                NVBIO_CUDA_DEBUG_STATEMENT( log_debug( stderr, "    extract\n" ) );
                set.extract(
                    n_active_strings,
                    raw_pointer( d_indices ),
                    word_idx,
                    word_block_begin,
                    word_block_end,
                    raw_pointer( d_keys ) );

                NVBIO_CUDA_DEBUG_STATEMENT( cudaDeviceSynchronize() );
                timer.stop();
                extract_time += timer.seconds();
                #else
                timer.start();

                // extract all the radices, in unsorted order
                NVBIO_CUDA_DEBUG_STATEMENT( log_debug( stderr, "    extract\n" ) );
                set.extract(
                    n_strings,
                    NULL,
                    word_idx,
                    word_block_begin,
                    word_block_end,
                    raw_pointer( d_temp_indices ) );

                NVBIO_CUDA_DEBUG_STATEMENT( cudaDeviceSynchronize() );
                timer.stop();
                extract_time += timer.seconds();

                timer.start();

                // gather the radices in their proper sorting order
                NVBIO_CUDA_DEBUG_STATEMENT( log_debug( stderr, "    gather\n" ) );
                thrust::gather(
                    d_indices.begin(),
                    d_indices.begin() + n_active_strings,
                    d_temp_indices.begin(),
                    d_keys.begin() );

                NVBIO_CUDA_DEBUG_STATEMENT( cudaDeviceSynchronize() );
                timer.stop();
                copy_time += timer.seconds();
                #endif
                timer.start();

                // build the compressed flags
                NVBIO_CUDA_DEBUG_STATEMENT( log_debug( stderr, "    pack-flags\n" ) );
                uint32* d_comp_flags = (uint32*)nvbio::raw_pointer( d_temp_flags );
                priv::pack_flags(
                    n_active_strings,
                    nvbio::raw_pointer( d_segment_flags ),
                    d_comp_flags );

                NVBIO_CUDA_DEBUG_STATEMENT( cudaDeviceSynchronize() );
                cuda::check_error("CompressionSort::sort() : pack_flags");

                // sort within segments
                NVBIO_CUDA_DEBUG_STATEMENT( log_debug( stderr, "    seg-sort\n" ) );
                mgpu::SegSortPairsFromFlags(
                    nvbio::raw_pointer( d_keys ),
                    nvbio::raw_pointer( d_indices ),
                    n_active_strings,
                    d_comp_flags,
                    *m_mgpu );

                NVBIO_CUDA_DEBUG_STATEMENT( cudaDeviceSynchronize() );
                cuda::check_error("CompressionSort::sort() : seg_sort");

                timer.stop();
                radixsort_time += timer.seconds();

                timer.start();

                // find consecutive items with equal keys
                //
                // We can easily compute the head flags for a set of "segments" of equal keys,
                // just by comparing each key in the sorted list with its predecessor.
                // At that point, we can isolate all segments which contain more than 1 string
                // and continue sorting those by themselves.
                NVBIO_CUDA_DEBUG_STATEMENT( log_debug( stderr, "    build-head-flags\n" ) );
                priv::build_head_flags(
                    n_active_strings,
                    nvbio::raw_pointer( d_keys ),
                    nvbio::raw_pointer( d_segment_flags ) );

                NVBIO_CUDA_DEBUG_STATEMENT( cudaDeviceSynchronize() );
                cuda::check_error("CompressionSort::sort() : build_head_flags");

                d_segment_flags[0]                = 1u; // make sure the first flag is a 1
                d_segment_flags[n_active_strings] = 1u; // and add a sentinel

                // perform a scan to "compress" the keys in place, removing holes between them
                // and reducing their entropy; this operation produces a 1-based vector of
                // contiguous values of the kind (1, 1, 2, 3, 3, 3, ...)
                cuda::inclusive_scan(
                    n_active_strings,
                    thrust::make_transform_iterator( d_segment_flags.begin(), cast_functor<uint8,uint32>() ),
                    d_keys.begin(),
                    thrust::plus<uint32>(),
                    d_temp_storage );

                NVBIO_CUDA_DEBUG_STATEMENT( cudaDeviceSynchronize() );
                timer.stop();
                compress_time += timer.seconds();

                const uint32 n_segments = d_keys[ n_active_strings - 1u ];
                //NVBIO_CUDA_DEBUG_STATEMENT( fprintf(stderr,"\n    segments: %u/%u, at pass %u\n", n_segments, n_active_strings, word_idx) );

                if (n_segments == n_active_strings ||
                    word_idx+1 == max_words) // we are done!
                {
                    timer.start();

                    if (n_active_strings == n_strings)
                    {
                        // copy the fully sorted indices to the output
                        NVBIO_CUDA_DEBUG_STATEMENT( log_debug( stderr, "    copy-partials\n" ) );
                        thrust::copy(
                            d_indices.begin(),
                            d_indices.begin() + n_active_strings,
                            d_output );
                    }
                    else
                    {
                        // scatter the partially sorted indices to the output in their proper place
                        NVBIO_CUDA_DEBUG_STATEMENT( log_debug( stderr, "    scatter-partials\n" ) );
                        thrust::scatter(
                            d_indices.begin(),
                            d_indices.begin() + n_active_strings,
                            d_active_slots.begin(),
                            d_output );
                    }
                    NVBIO_CUDA_DEBUG_STATEMENT( cudaDeviceSynchronize() );
                    timer.stop();
                    scatter_time += timer.seconds();
                    return; // bail out of the sorting loop
                }
            }

            if (word_block_end < max_words)
            {
                //
                // Extract all the strings from segments with more than 1 element: in order to
                // do this we want to mark all the elements that need to be copied. These are
                // exactly all the 1's followed by a 0, and all the 0's. Conversely, all the
                // 1's followed by a 1 don't need to be copied, i.e. we want to transform the
                // flags:
                //   (1,1,0,0,0,1,1,1,0,1) into the vector:
                //   (0,1,1,1,1,0,0,1,1,0)
                // This can be done again by looking at neighbours, this time with a custom
                // binary predicate implementing the above logic.
                // As thrust::adjacent_difference is right-aligned, we'll obtain the following
                // situation instead:
                //   segment-flags: (1,1,0,0,0,1,1,1,0,1,[1])  (where [1] is a manually inserted sentinel)
                //   copy-flags:    (1,0,1,1,1,1,0,0,1,1,0,[.])
                // where we have added a sentinel value to the first vector, and will find our
                // results starting at position one in the output.
                //
                thrust::adjacent_difference(
                    d_segment_flags.begin(),
                    d_segment_flags.begin() + n_active_strings+1u,
                    d_copy_flags.begin(),
                    priv::remove_singletons() );

                const uint32 n_partials = cuda::reduce(
                    n_active_strings,
                    thrust::make_transform_iterator( d_copy_flags.begin() + 1u, cast_functor<uint8,uint32>() ),
                    thrust::plus<uint32>(),
                    d_temp_storage );

                // check if the number of "unique" keys is small enough to justify reducing the active set
                //if (2u*n_segments >= n_active_strings)
                if (2u*n_partials <= n_active_strings)
                {
                    NVBIO_CUDA_DEBUG_STATEMENT( log_debug( stderr, "    filter-unique %u\n", n_partials ) );

                    timer.start();

                    // scatter the partially sorted indices to the output in their proper place
                    thrust::scatter(
                        d_indices.begin(),
                        d_indices.begin() + n_active_strings,
                        d_active_slots.begin(),
                        d_output );

                    // check whether we are done sorting
                    if (n_partials == 0)
                    {
                        NVBIO_CUDA_DEBUG_STATEMENT( cudaDeviceSynchronize() );
                        timer.stop();
                        compact_time += timer.seconds();
                        return;
                    }

                    thrust::device_vector<uint32>& d_temp_indices = d_keys;

                    // now keep only the indices we are interested in
                    cuda::copy_flagged(
                        n_active_strings,
                        d_indices.begin(),
                        d_copy_flags.begin() + 1u,
                        d_temp_indices.begin(),
                        d_temp_storage );

                    d_indices.swap( d_temp_indices );

                    // as well as their slots
                    cuda::copy_flagged(
                        n_active_strings,
                        d_active_slots.begin(),
                        d_copy_flags.begin() + 1u,
                        d_temp_indices.begin(),
                        d_temp_storage );

                    d_active_slots.swap( d_temp_indices );

                    // and the segment flags
                    cuda::copy_flagged(
                        n_active_strings,
                        d_segment_flags.begin(),
                        d_copy_flags.begin() + 1u,
                        d_temp_flags.begin(),
                        d_temp_storage );

                    d_segment_flags.swap( d_temp_flags );

                    // overwrite the number of active strings
                    n_active_strings = n_partials;

                    NVBIO_CUDA_DEBUG_STATEMENT( cudaDeviceSynchronize() );
                    timer.stop();
                    compact_time += timer.seconds();
                }
            }
        }
    }
    catch (cuda_error& error)
    {
        log_error(stderr, "CompressionSort::sort() : cuda_error caught!\n  %s\n", error.what());
        throw error;
    }
    catch (...)
    {
        log_error(stderr, "CompressionSort::sort() : exception caught!\n");
        throw;
    }
}

} // namespace cuda
} // namespace nvbio