NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
sufsort_priv.h
Go to the documentation of this file.
1 /*
2  * nvbio
3  * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #pragma once
29 
30 #include <cub/cub.cuh>
31 #include <mgpuhost.cuh>
32 #include <moderngpu.cuh>
35 #include <nvbio/basic/cuda/sort.h>
36 #include <nvbio/basic/cuda/timer.h>
37 #include <nvbio/basic/cuda/ldg.h>
39 #include <thrust/host_vector.h>
40 #include <thrust/device_vector.h>
41 #include <thrust/adjacent_difference.h>
42 #include <thrust/binary_search.h>
43 #include <thrust/iterator/constant_iterator.h>
44 
45 #if defined(PLATFORM_X86)
46 #include <emmintrin.h> // SSE intrinsics
47 #endif
48 
49 namespace nvbio {
50 namespace priv {
51 
52 template <uint32 BITS> struct word_selector {};
53 template <> struct word_selector<4> { typedef uint8 type; };
54 template <> struct word_selector<6> { typedef uint8 type; };
55 template <> struct word_selector<8> { typedef uint8 type; };
56 template <> struct word_selector<10> { typedef uint8 type; };
57 template <> struct word_selector<12> { typedef uint16 type; };
58 template <> struct word_selector<14> { typedef uint16 type; };
59 template <> struct word_selector<16> { typedef uint16 type; };
60 template <> struct word_selector<18> { typedef uint16 type; };
61 template <> struct word_selector<20> { typedef uint32 type; };
62 template <> struct word_selector<22> { typedef uint32 type; };
63 template <> struct word_selector<24> { typedef uint32 type; };
64 template <> struct word_selector<26> { typedef uint32 type; };
65 template <> struct word_selector<28> { typedef uint32 type; };
66 template <> struct word_selector<30> { typedef uint32 type; };
67 template <> struct word_selector<32> { typedef uint32 type; };
68 template <> struct word_selector<48> { typedef uint64 type; };
69 template <> struct word_selector<64> { typedef uint64 type; };
70 
71 typedef ConcatenatedStringSet<
74 
75 typedef ConcatenatedStringSet<
78 
79 typedef ConcatenatedStringSet<
82 
83 typedef ConcatenatedStringSet<
86 
87 typedef ConcatenatedStringSet<
90 
97 
98 void extract_radices(
99  const priv::string_set_2bit_be string_set,
100  const uint32 n_suffixes,
101  const uint32 word_begin,
102  const uint32 word_end,
103  const uint32 word_bits,
104  const uint2* suffixes,
105  uint32* radices,
106  uint8* symbols = NULL);
107 
108 void extract_radices(
109  const priv::string_set_2bit_u64_be string_set,
110  const uint32 n_suffixes,
111  const uint32 word_begin,
112  const uint32 word_end,
113  const uint32 word_bits,
114  const uint2* suffixes,
115  uint64* radices,
116  uint8* symbols = NULL);
117 
118 // make sure a given buffer is big enough
119 //
120 template <typename VectorType>
121 void alloc_storage(VectorType& vec, const uint64 size)
122 {
123  if (vec.size() < size)
124  {
125  try
126  {
127  vec.clear();
128  vec.resize( size );
129  }
130  catch (...)
131  {
132  log_error(stderr,"alloc_storage() : allocation failed!\n");
133  throw;
134  }
135  }
136 }
137 
140 template <typename storage_type>
142 storage_type clearmask(const uint32 n) { return ~((storage_type(1u) << n)-1u); }
143 
147 {
149  typedef bool result_type;
150 
153  in_range_functor(const uint32 _begin, const uint32 _end) : begin(_begin), end(_end) {}
154 
158  bool operator() (const uint32 i) const { return i >= begin && i < end; }
159 
160  const uint32 begin, end;
161 };
162 
165 struct minus_one
166 {
169 
173  uint32 operator() (const uint32 i) const { return i - 1; }
174 };
175 
179 {
182 
186  offset_functor(const uint32 _offset) : offset(_offset) {}
187 
191  uint32 operator() (const uint32 i) const { return i + offset; }
192 
193  const uint32 offset;
194 };
195 
199 {
202 
206  add_divide_functor(const uint32 _a, const uint32 _k) : a(_a), k(_k) {}
207 
211  uint32 operator() (const uint32 i) const { return (i + a) / k; }
212 
213  const uint32 a;
214  const uint32 k;
215 };
216 
219 template <typename string_set_type>
221 {
224 
228  length_functor(const string_set_type _string_set, const bool _extended) : string_set(_string_set), extended(_extended) {}
229 
233  uint32 operator() (const uint32 i) const
234  {
235  return string_set[i].length() + (extended ? 1u : 0u);
236  }
237 
238  string_set_type string_set;
239  bool extended;
240 };
241 
245 {
246  typedef uint2 argument_type;
247  typedef uint2 result_type;
248 
252  suffix_offset_functor(const uint32 _offset) : offset(_offset) {}
253 
257  uint2 operator() (const uint2 suffix) const { return make_uint2( suffix.x, suffix.y + offset ); }
258 
259  const uint32 offset;
260 };
261 
265 {
268 };
269 
270 template <SuffixComponent COMP>
272 {
273  typedef uint2 argument_type;
275 
279  uint32 operator() (const uint2 suffix) const { return COMP == STRING_ID ? suffix.y : suffix.x; }
280 };
281 
282 template <uint32 WORD_BITS, uint32 DOLLAR_BITS, uint32 SYMBOL_SIZE, typename string_type, typename index_type>
285  const string_type string,
286  const index_type string_len,
287  const index_type suffix_idx,
288  const uint32 w)
289 {
290  const uint32 SYMBOLS_PER_WORD = uint32(WORD_BITS - DOLLAR_BITS)/SYMBOL_SIZE;
291  const uint32 SYMBOL_OFFSET = uint32(WORD_BITS) - SYMBOL_SIZE;
292 
293  uint32 word = 0u;
294  for (uint32 j = 0; j < SYMBOLS_PER_WORD; ++j)
295  {
296  const index_type jj = suffix_idx + w*SYMBOLS_PER_WORD + j;
297  const uint32 c = jj < string_len ? string[jj] : 0u;
298  word |= (c << (SYMBOL_OFFSET - j*SYMBOL_SIZE));
299  }
300 
301  if (DOLLAR_BITS)
302  {
303  // encode the dollar's position in the least significant bits of the word
304  const uint32 dollar_offset =
305  string_len <= suffix_idx + w*SYMBOLS_PER_WORD + SYMBOLS_PER_WORD ? // is there a dollar sign?
306  (string_len < suffix_idx + w*SYMBOLS_PER_WORD) ? 0u :
307  uint32(string_len - suffix_idx - w*SYMBOLS_PER_WORD) :
308  (1u << DOLLAR_BITS)-1u; // no dollar sign in this word
309 
310  return word | dollar_offset;
311  }
312  else
313  return word;
314 }
315 
318 template <uint32 SYMBOL_SIZE, uint32 WORD_BITS, uint32 DOLLAR_BITS>
321 {
322  const uint32 SYMBOLS_PER_WORD = (WORD_BITS - DOLLAR_BITS)/SYMBOL_SIZE;
323  return SYMBOLS_PER_WORD;
324 }
325 
326 template <uint32 WORD_BITS, uint32 DOLLAR_BITS, uint32 SYMBOL_SIZE, typename storage_type, typename index_type, typename sufindex_type>
328 typename std::iterator_traits<storage_type>::value_type extract_word_packed(
329  const storage_type base_words,
330  const index_type string_len,
331  const index_type string_off,
332  const sufindex_type suffix_idx,
333  const uint32 w)
334 {
335  typedef typename std::iterator_traits<storage_type>::value_type word_type;
336 
337  const uint32 STORAGE_BITS = uint32( 8u * sizeof(word_type) );
338  const uint32 STORAGE_SYMBOLS = STORAGE_BITS / SYMBOL_SIZE;
339  const uint32 SYMBOLS_PER_WORD = uint32(WORD_BITS - DOLLAR_BITS)/SYMBOL_SIZE;
340  //const uint32 SYMBOL_OFFSET = uint32(WORD_BITS) - SYMBOL_SIZE;
341 
342  const sufindex_type suffix_off = suffix_idx + w*SYMBOLS_PER_WORD; // suffix offset
343 
344  // do we have any symbols to encode?
345  if (suffix_off >= string_len)
346  return 0u;
347 
348  const index_type range_len = string_len - suffix_off; // partial suffix length
349  const index_type range_off = string_off + suffix_off; // partial suffix offset
350 
351  const uint32 n_symbols = (uint32)nvbio::min( range_len, index_type(SYMBOLS_PER_WORD) ); // symbols to pack
352 
353  //
354  // As SYMBOLS_PER_WORD is less than 32, we know that the symbols we are looking for
355  // will span at most 2 32-bit words.
356  // Of the n_symbols we want to read, there might be n1 in the first word, and n2 in the
357  // second word.
358  // As the beginning of our symbol stream (range_off) might stride the 32-bit word boundary,
359  // the highest m1 = range_off % 16 symbols of the first word might have to be discarded,
360  // and we'll find our n1 symbols in the following position:
361  //
362  // |-------------------------------------------|
363  // |* * * * * *| x x x x x x x x x | * * * * * |
364  // |-------------------------------------------|
365  // | m1 | n1 | r1 |
366  //
367  // What we do is shifting the n1 symbols to the top of the 32-bit word (i.e. m1 to the left).
368  // Clearing the remaining symbols is only needed if n1 == n_symbols; if n1 < n_symbols, r1 will
369  // be necessarily zero.
370  //
371  // At this point, we might have n2 more symbols to read in the highest bits of the second word:
372  //
373  // |-------------------------------------------|
374  // | y y y y y y y y | * * * * * * * * * * * * |
375  // |-------------------------------------------|
376  // | n2 | r2 |
377  //
378  // which we need to shift right by (n1*SYMBOL_SIZE) bits.
379  // At the very end, we'll shift everything right by (32 - WORD_BITS) bits in order to have
380  // our output tightly packed in the lowest WORD_BITS:
381  //
382  // 32 WORD_BITS DOLLAR_BITS 0
383  // |-----------|-----------------------|-------|
384  // | * * * * * | x x x x | y y y | 0 0 | $ $ $ | // notice the possible presence of 0's before
385  // |-------------------------------------------| // the $ sign: these are bits that need to be
386  // | | n1 | n2 | | | // cleared if the suffix is short
387  //
388 
389  const uint32 k1 = uint32( range_off/STORAGE_SYMBOLS ); // index of the first word
390 
391  const uint32 m1 = range_off & (STORAGE_SYMBOLS-1); // offset in the word
392  const uint32 r1 = STORAGE_SYMBOLS - m1; // symbols to read
393  const word_type word1 = (base_words[ k1 ] << (m1*SYMBOL_SIZE)); // fetch the first word, shifted left
394 
395  word_type word = word1;
396 
397  if (n_symbols > r1) // do we need to read another word?
398  {
399  const word_type word2 = base_words[ k1+1u ]; // fetch the second word
400  word |= word2 >> (r1*SYMBOL_SIZE); // shift by n1 symbols to the right
401  }
402 
403  word >>= (STORAGE_BITS - WORD_BITS); // align the top to WORD_BITS
404 
405  // clear every symbol we don't need among the word's LSD
406  word &= clearmask<word_type>( WORD_BITS - n_symbols*SYMBOL_SIZE );
407 
408  if (DOLLAR_BITS)
409  {
410  // encode the dollar's position in the least significant bits of the word
411  const word_type dollar_offset =
412  range_len <= SYMBOLS_PER_WORD ? // is there a dollar sign?
413  range_len :
414  (1u << DOLLAR_BITS)-1u; // no dollar sign in this word
415 
416  return word | dollar_offset;
417  }
418  else
419  return word;
420 }
421 
422 template <uint32 WORD_BITS, uint32 DOLLAR_BITS, uint32 SYMBOL_SIZE, typename storage_type, typename index_type, typename sufindex_type, typename output_iterator>
425  const storage_type base_words,
426  const index_type string_len,
427  const index_type string_off,
428  const sufindex_type suffix_idx,
429  const uint32 word_begin,
430  const uint32 word_end,
431  output_iterator words)
432 {
433  typedef typename std::iterator_traits<storage_type>::value_type word_type;
434 
435  const uint32 STORAGE_BITS = uint32( 8u * sizeof(word_type) );
436  const uint32 STORAGE_SYMBOLS = STORAGE_BITS / SYMBOL_SIZE;
437  const uint32 SYMBOLS_PER_WORD = uint32(WORD_BITS - DOLLAR_BITS)/SYMBOL_SIZE;
438  //const uint32 SYMBOL_OFFSET = uint32(WORD_BITS) - SYMBOL_SIZE;
439 
440  sufindex_type suffix_off = suffix_idx + word_begin*SYMBOLS_PER_WORD; // suffix offset
441 
442  index_type range_len = string_len - suffix_off; // partial suffix length
443  index_type range_off = string_off + suffix_off; // partial suffix offset
444 
445  const uint32 cache_begin = uint32( range_off / STORAGE_SYMBOLS );
446 
447  #if defined(PLATFORM_X86) && !defined(NVBIO_DEVICE_COMPILATION)
448  // use SSE to load all the words we need in a small cache
449  const uint32 SSE_WORDS = 16u / sizeof( word_type );
450  const uint32 cache_end = uint32( (range_off + (word_end - word_begin)*SYMBOLS_PER_WORD) / STORAGE_SYMBOLS );
451 
452  __m128i sse_cache[8];
453  for (uint32 w = cache_begin; w < cache_end; w += SSE_WORDS)
454  sse_cache[ (w - cache_begin)/SSE_WORDS ] = _mm_loadu_si128( (const __m128i*)(base_words + w) );
455 
456  const word_type* cached_words = (const word_type*)sse_cache;
457  #elif 0
458  const_cached_iterator<storage_type> cached_words( base_words + cache_begin );
459  #else
460  const storage_type cached_words = base_words + cache_begin;
461  #endif
462 
463  for (uint32 w = word_begin; w < word_end; ++w)
464  {
465  // do we have any symbols to encode?
466  if (suffix_off >= string_len)
467  {
468  words[w - word_begin] = 0u;
469  continue;
470  }
471 
472  const uint32 n_symbols = (uint32)nvbio::min( range_len, index_type(SYMBOLS_PER_WORD) ); // symbols to pack
473 
474  //
475  // As SYMBOLS_PER_WORD is less than 32, we know that the symbols we are looking for
476  // will span at most 2 32-bit words.
477  // Of the n_symbols we want to read, there might be n1 in the first word, and n2 in the
478  // second word.
479  // As the beginning of our symbol stream (range_off) might stride the 32-bit word boundary,
480  // the highest m1 = range_off % 16 symbols of the first word might have to be discarded,
481  // and we'll find our n1 symbols in the following position:
482  //
483  // |-------------------------------------------|
484  // |* * * * * *| x x x x x x x x x | * * * * * |
485  // |-------------------------------------------|
486  // | m1 | n1 | r1 |
487  //
488  // What we do is shifting the n1 symbols to the top of the 32-bit word (i.e. m1 to the left).
489  // Clearing the remaining symbols is only needed if n1 == n_symbols; if n1 < n_symbols, r1 will
490  // be necessarily zero.
491  //
492  // At this point, we might have n2 more symbols to read in the highest bits of the second word:
493  //
494  // |-------------------------------------------|
495  // | y y y y y y y y | * * * * * * * * * * * * |
496  // |-------------------------------------------|
497  // | n2 | r2 |
498  //
499  // which we need to shift right by (n1*SYMBOL_SIZE) bits.
500  // At the very end, we'll shift everything right by (32 - WORD_BITS) bits in order to have
501  // our output tightly packed in the lowest WORD_BITS:
502  //
503  // 32 WORD_BITS DOLLAR_BITS 0
504  // |-----------|-----------------------|-------|
505  // | * * * * * | x x x x | y y y | 0 0 | $ $ $ | // notice the possible presence of 0's before
506  // |-------------------------------------------| // the $ sign: these are bits that need to be
507  // | | n1 | n2 | | | // cleared if the suffix is short
508  //
509  const uint32 k1 = uint32( range_off/STORAGE_SYMBOLS ) - cache_begin; // index of the first word
510 
511  const uint32 m1 = range_off & (STORAGE_SYMBOLS-1); // offset in the word
512  const uint32 r1 = STORAGE_SYMBOLS - m1; // symbols left in the word
513  const word_type word1 = (cached_words[ k1 ] << (m1*SYMBOL_SIZE)); // fetch the first word, shifted left
514 
515  word_type word = word1;
516 
517  if (n_symbols > r1) // do we need to read another word?
518  {
519  const word_type word2 = cached_words[ k1+1u ]; // fetch the second word
520  word |= word2 >> (r1*SYMBOL_SIZE); // shift by n1 symbols to the right
521  }
522 
523  word >>= (STORAGE_BITS - WORD_BITS); // align the top to WORD_BITS
524 
525  // clear every symbol we don't need among the word's LSD
526  word &= clearmask<word_type>( WORD_BITS - n_symbols*SYMBOL_SIZE );
527 
528  if (DOLLAR_BITS)
529  {
530  // encode the dollar's position in the least significant bits of the word
531  const word_type dollar_offset =
532  range_len <= SYMBOLS_PER_WORD ? // is there a dollar sign?
533  range_len :
534  (1u << DOLLAR_BITS)-1u; // no dollar sign in this word
535 
536  word |= dollar_offset;
537  }
538 
539  // write the word out
540  words[ w - word_begin ] = word;
541 
542  suffix_off += SYMBOLS_PER_WORD;
543  range_len -= SYMBOLS_PER_WORD;
544  range_off += SYMBOLS_PER_WORD;
545  }
546 }
547 
551 {
553  typedef uint2 result_type;
554 
558  localize_suffix_functor(const uint32* _cum_lengths, const uint32* _string_ids, const uint32 _string_offset = 0u) :
559  cum_lengths(_cum_lengths),
560  string_ids(_string_ids),
561  string_offset( _string_offset ) {}
562 
566  result_type operator() (const uint32 global_suffix_idx) const
567  {
568  const uint32 string_idx = string_ids[ global_suffix_idx ];
569  const uint32 suffix_idx = global_suffix_idx - (string_idx ? cum_lengths[ string_idx-1u ] : 0u);
570 
571  return make_uint2( suffix_idx, string_offset + string_idx );
572  }
573 
577 };
578 
581 template <uint32 SYMBOL_SIZE, uint32 WORD_BITS, uint32 DOLLAR_BITS, typename string_set_type, typename word_type>
583 {
584  typedef uint2 argument_type;
585  typedef word_type result_type;
586 
590  local_set_suffix_word_functor(const string_set_type _string_set, const uint32 _w) :
591  string_set(_string_set),
592  w(_w) {}
593 
597  result_type operator() (const uint2 local_suffix_idx) const
598  {
599  typedef typename string_set_type::string_type string_type;
600 
601  const uint32 string_idx = local_suffix_idx.y;
602  const uint32 suffix_idx = local_suffix_idx.x;
603 
604  const string_type string = string_set[string_idx];
605  const uint32 string_len = string.length();
606 
607  return result_type( extract_word_generic<WORD_BITS,DOLLAR_BITS,SYMBOL_SIZE>(
608  string,
609  string_len,
610  suffix_idx,
611  w ) );
612  }
613 
614  string_set_type string_set;
616 };
617 
620 template <uint32 SYMBOL_SIZE, uint32 WORD_BITS, uint32 DOLLAR_BITS, typename storage_type, typename word_type, typename offsets_iterator>
622  SYMBOL_SIZE, WORD_BITS, DOLLAR_BITS,
624  PackedStream<storage_type,uint8,SYMBOL_SIZE,true,typename std::iterator_traits<offsets_iterator>::value_type>,
625  offsets_iterator>,
626  word_type>
627 {
628  typedef typename std::iterator_traits<offsets_iterator>::value_type index_type;
629  typedef ConcatenatedStringSet<
631  offsets_iterator> string_set_type;
632 
633  typedef uint2 argument_type;
634  typedef word_type result_type;
635 
639  local_set_suffix_word_functor(const string_set_type _string_set, const uint32 _w) :
640  string_set(_string_set),
641  w(_w) {}
642 
646  result_type operator() (const uint2 local_suffix_idx) const
647  {
648  typedef typename string_set_type::string_type string_type;
649 
650  const uint32 string_idx = local_suffix_idx.y;
651  const uint32 suffix_idx = local_suffix_idx.x;
652 
653  const index_type string_off = string_set.offsets()[ string_idx ];
654  const index_type string_end = string_set.offsets()[ string_idx+1u ];
655  const index_type string_len = uint32( string_end - string_off );
656 
657  const storage_type base_words = string_set.base_string().stream();
658 
659  return result_type( extract_word_packed<WORD_BITS,DOLLAR_BITS,SYMBOL_SIZE>(
660  base_words,
661  string_len,
662  string_off,
663  suffix_idx,
664  w ) );
665  }
666 
669 };
670 
673 template <uint32 SYMBOL_SIZE, uint32 WORD_BITS, uint32 DOLLAR_BITS, typename string_set_type, typename word_type>
675 {
677  typedef word_type result_type;
678 
682  global_set_suffix_word_functor(const string_set_type _string_set, const uint32* _cum_lengths, const uint32* _string_ids, const uint32 _w) :
683  word_functor( _string_set, _w ),
684  localizer( _cum_lengths, _string_ids ) {}
685 
689  result_type operator() (const uint32 global_suffix_idx) const
690  {
691  return word_functor( localizer( global_suffix_idx ) );
692  }
693 
696 };
697 
700 template <uint32 SYMBOL_SIZE, uint32 WORD_BITS, uint32 DOLLAR_BITS, typename string_type, typename word_type>
702 {
704  typedef word_type result_type;
705 
709  string_suffix_word_functor(const uint64 _string_len, const string_type _string, const uint32 _w) :
710  string_len(_string_len),
711  string(_string),
712  w(_w) {}
713 
717  result_type operator() (const uint64 suffix_idx) const
718  {
719  return result_type( extract_word_generic<WORD_BITS,DOLLAR_BITS,SYMBOL_SIZE>(
720  string,
721  string_len,
722  suffix_idx,
723  w ) );
724  }
725 
727  string_type string;
729 };
730 
733 template <uint32 SYMBOL_SIZE, uint32 WORD_BITS, uint32 DOLLAR_BITS, typename storage_type, typename symbol_type, typename index_type, typename word_type>
735  SYMBOL_SIZE, WORD_BITS, DOLLAR_BITS,
736  PackedStream<storage_type,symbol_type,SYMBOL_SIZE,true,index_type>,
737  word_type>
738 {
740  typedef uint2 argument_type;
741  typedef word_type result_type;
742 
746  string_suffix_word_functor(const index_type _string_len, const string_type _string, const uint32 _w) :
747  string_len(_string_len),
748  string(_string),
749  w(_w) {}
750 
754  result_type operator() (const index_type suffix_idx) const
755  {
756  const storage_type base_words = string.stream();
757 
758  return result_type( extract_word_packed<WORD_BITS,DOLLAR_BITS,SYMBOL_SIZE>(
759  base_words,
760  string_len,
761  string.index(),
762  suffix_idx,
763  w ) );
764  }
765 
766  const index_type string_len;
769 };
770 
773 template <typename string_type>
775 {
779 
783  string_suffix_difference(const uint64 _string_len, const string_type _string, const uint32 _cmp_len) :
784  string_len(_string_len),
785  string(_string),
786  cmp_len( _cmp_len ) {}
787 
791  result_type operator() (const uint64 suffix_idx1, const uint64 suffix_idx2) const
792  {
793  // if one of the two suffixes is less than cmp_len, then the two suffixes must
794  // necessarily differ (because no two suffixes have the same length)
795  if (string_len - suffix_idx1 < cmp_len ||
796  string_len - suffix_idx2 < cmp_len)
797  return 1u;
798 
799  for (uint32 i = 0; i < cmp_len; ++i)
800  {
801  if (string[suffix_idx1 + i] != string[suffix_idx2 + i])
802  return 1u;
803  }
804  return 0u;
805  }
806 
808  string_type string;
810 };
811 
814 template <uint32 SYMBOL_SIZE, typename string_type>
816 {
820 
824  string_suffix_less(const uint64 _string_len, const string_type _string) :
825  string_len(_string_len),
826  string(_string) {}
827 
831  result_type operator() (const uint64 suffix_idx1, const uint64 suffix_idx2) const
832  {
833  const uint32 WORD_BITS = 32u; // use 32-bit words
834  const uint32 DOLLAR_BITS = 4u; // 4 is the minimum number needed to encode up to 16 symbols per word
835  const uint32 SYMBOLS_PER_WORD = symbols_per_word<SYMBOL_SIZE,WORD_BITS,DOLLAR_BITS>();
836 
837  const uint32 n_words = uint32( nvbio::min(
838  (string_len - suffix_idx1),
839  (string_len - suffix_idx2) ) + SYMBOLS_PER_WORD-1 ) / SYMBOLS_PER_WORD;
840 
841  // loop through all string-words
842  for (uint32 w = 0; w < n_words; ++w)
843  {
845 
846  const uint32 w1 = word_functor( suffix_idx1 );
847  const uint32 w2 = word_functor( suffix_idx2 );
848  if (w1 < w2) return true;
849  if (w1 > w2) return false;
850  }
851  return false;
852  }
853 
855  string_type string;
856 };
857 
861 template <typename string_type>
863 {
866 
870  string_bwt_functor(const uint64 _string_len, const string_type _string) :
871  string_len(_string_len),
872  string(_string) {}
873 
877  result_type operator() (const argument_type suffix_idx) const
878  {
879  return suffix_idx ? string[suffix_idx-1] : 255u; // use 255u to mark the dollar sign
880  }
881 
883  const string_type string;
884 };
885 
889 template <typename string_set_type>
891 {
892  typedef uint2 argument_type;
894 
898  string_set_bwt_functor(const string_set_type _string_set) :
899  string_set(_string_set) {}
900 
904  result_type operator() (const argument_type local_suffix_idx) const
905  {
906  typedef typename string_set_type::string_type string_type;
907 
908  const uint32 string_idx = local_suffix_idx.y;
909  const uint32 suffix_idx = local_suffix_idx.x;
910 
911  const string_type string = string_set[string_idx];
912 
913  return suffix_idx ? string[suffix_idx-1] : 255u; // use 255u to mark the dollar sign
914  }
915 
919  result_type operator() (const uint32 string_idx) const
920  {
921  typedef typename string_set_type::string_type string_type;
922 
923  const string_type string = string_set[string_idx];
924 
925  return string[ string.length()-1 ];
926  }
927 
928  const string_set_type string_set;
929 };
930 
934 {
938 
944  result_type operator() (const uint32 flag1, const uint32 flag2) const
945  {
946  return (flag1 && flag2) ? 0u : 1u;
947  }
948 };
949 
953 {
957 
964  {
965  return (key << 32u) | second_argument_type( radix );
966  }
967 };
968 
969 /*
970 template <uint32 SYMBOL_SIZE, uint32 WORD_BITS, uint32 DOLLAR_BITS, typename string_set_type>
971 struct dispatch_set_suffix_radices
972 {
973  template <typename radix_iterator>
974  void enact(
975  const string_set_type& string_set,
976  const SetSuffixFlattener<SYMBOL_SIZE>& set_flattener,
977  radix_iterator radices)
978  {
979  typedef typename std::iterator_traits<radix_iterator>::value_type word_type;
980 
981  thrust::transform(
982  thrust::make_counting_iterator(0u),
983  thrust::make_counting_iterator(0u) + set_flattener.n_suffixes,
984  radices,
985  global_set_suffix_word_functor<SYMBOL_SIZE,BITS,DOLLAR_BITS,string_set_type,word_type>(
986  string_set,
987  nvbio::device_view( set_flattener.cum_lengths ),
988  nvbio::device_view( set_flattener.string_ids ),
989  0u ) );
990  }
991 };
992 
993 template <uint32 SYMBOL_SIZE, uint32 WORD_BITS, uint32 DOLLAR_BITS, typename storage_type, typename index_type>
994 struct dispatch_set_suffix_radices<
995  SYMBOL_SIZE, WORD_BITS, DOLLAR_BITS,
996  ConcatenatedStringSet<PackedStream<storage_type,uint8,SYMBOL_SIZE,true,index_type>,index_type*>,
997  word_type>
998 {
999  typedef ConcatenatedStringSet<
1000  PackedStream<storage_type,uint8,SYMBOL_SIZE,true,index_type>,
1001  index_type*> string_set_type;
1002 
1003  template <typename radix_iterator>
1004  void enact(
1005  const string_set_type& string_set,
1006  const SetSuffixFlattener<SYMBOL_SIZE>& set_flattener,
1007  radix_iterator radices)
1008  {
1009  typedef typename std::iterator_traits<radix_iterator>::value_type word_type;
1010 
1011  thrust::transform(
1012  thrust::make_counting_iterator(0u),
1013  thrust::make_counting_iterator(0u) + set_flattener.n_suffixes,
1014  radices,
1015  global_set_suffix_word_functor<SYMBOL_SIZE,BITS,DOLLAR_BITS,string_set_type,word_type>(
1016  string_set,
1017  nvbio::device_view( set_flattener.cum_lengths ),
1018  nvbio::device_view( set_flattener.string_ids ),
1019  word_idx ) );
1020  }
1021 };
1022 */
1023 
1024 template <uint32 BITS, uint32 DOLLAR_BITS>
1025 struct Bits {};
1026 
1030 template <uint32 SYMBOL_SIZE>
1032 {
1034  d_scan_time(0.0f),
1035  d_search_time(0.0f),
1036  m_mgpu( _mgpu ) {}
1037 
1041  {
1042  d_scan_time = 0.0f;
1043  d_search_time = 0.0f;
1044  }
1045 
1048  void reserve(const uint32 n_strings, const uint32 n_suffixes)
1049  {
1050  alloc_storage( cum_lengths, n_strings );
1051  alloc_storage( string_ids, n_suffixes );
1052  }
1053 
1056  uint64 needed_device_memory(const uint32 n_strings, const uint32 n_suffixes) const
1057  {
1058  return (n_strings + n_suffixes) * sizeof(uint32);
1059  }
1060 
1064  template <typename string_set_type>
1065  void set(const string_set_type& string_set, const bool empty_suffixes = true)
1066  {
1067  const uint32 n = string_set.size();
1068 
1069  cuda::Timer timer;
1070  timer.start();
1071 
1072  // compute the cumulative sum of the string lengths in the set - we will use this for
1073  // building the map: (global suffix index -> string index)
1074  alloc_storage( cum_lengths, n );
1075 
1077  n,
1078  thrust::make_transform_iterator( thrust::make_counting_iterator(0u), length_functor<string_set_type>( string_set, empty_suffixes ) ),
1079  cum_lengths.begin(),
1080  thrust::plus<uint32>(),
1081  temp_storage );
1082 
1083  // compute the number of suffixes
1084  n_suffixes = cum_lengths[n-1];
1085 
1086  timer.stop();
1087  d_scan_time += timer.seconds();
1088 
1089  timer.start();
1090 
1091  // assign the string id to each suffix - this is done by a simple binary search on the suffix index
1092  // in the vector of cumulative string lengths
1094 
1095  // find the end of each bin of values
1096  mgpu::SortedSearch<mgpu::MgpuBoundsLower>(
1097  thrust::make_counting_iterator<uint32>(0u),
1098  n_suffixes,
1100  n,
1101  string_ids.begin(),
1102  *m_mgpu );
1103 
1104  timer.stop();
1105  d_search_time += timer.seconds();
1106  }
1107 
1110  template <uint32 BITS, uint32 DOLLAR_BITS, typename string_set_type, typename index_iterator, typename radix_iterator>
1111  void flatten(
1112  const string_set_type& string_set,
1113  const uint32 word_idx,
1114  const Bits<BITS,DOLLAR_BITS> word_bits,
1115  const index_iterator indices,
1116  radix_iterator radices)
1117  {
1118  typedef typename std::iterator_traits<radix_iterator>::value_type word_type;
1119 
1121  indices,
1122  indices + n_suffixes,
1123  radices,
1125  string_set,
1128  word_idx ) );
1129  }
1130 
1133  template <typename string_set_type>
1135  const string_set_type& string_set, const bool empty_suffixes = true)
1136  {
1137  // compute the maximum string length in the set
1138  return cuda::reduce(
1139  uint32( string_set.size() ),
1141  thrust::make_counting_iterator<uint32>(0u),
1142  length_functor<string_set_type>( string_set, empty_suffixes ) ),
1143  thrust::maximum<uint32>(),
1144  temp_storage );
1145  }
1146 
1149  template <typename string_set_type, typename index_iterator>
1151  const string_set_type& string_set,
1152  const index_iterator indices_begin,
1153  const index_iterator indices_end)
1154  {
1155  // TODO: this function is conservative, in the sense it returns the maximum *string* length;
1156  // however, each suffix might be shorter than the string it belongs to.
1157  return indices_end <= indices_begin ? 0u :
1158  cuda::reduce(
1159  indices_end - indices_begin,
1161  thrust::make_permutation_iterator( string_ids.begin(), indices_begin ),
1162  length_functor<string_set_type>( string_set, false ) ),
1163  thrust::maximum<uint32>(),
1164  temp_storage );
1165  }
1166 
1170  {
1171  return
1172  cum_lengths.size() * sizeof(uint32) +
1173  string_ids.size() * sizeof(uint32) +
1174  temp_storage.size() * sizeof(uint8);
1175  }
1176 
1178  thrust::device_vector<uint32> cum_lengths;
1179  thrust::device_vector<uint32> string_ids;
1180  thrust::device_vector<uint8> temp_storage;
1184 
1188 };
1189 
1192 template <uint32 SYMBOL_SIZE, bool BIG_ENDIAN, typename storage_type, typename offsets_iterator, typename input_tag, typename output_tag>
1193 struct ChunkLoader {};
1194 
1197 template <uint32 SYMBOL_SIZE, bool BIG_ENDIAN, typename storage_type, typename offsets_iterator>
1198 struct ChunkLoader<SYMBOL_SIZE,BIG_ENDIAN,storage_type,offsets_iterator,host_tag,device_tag>
1199 {
1200  // infer the word type
1201  typedef typename std::iterator_traits<storage_type>::value_type word_type;
1202  typedef typename std::iterator_traits<offsets_iterator>::value_type index_type;
1203 
1205 
1208 
1210 
1211  // infer the word size
1212  static const uint32 SYMBOLS_PER_WORD = uint32(8u*sizeof(word_type))/SYMBOL_SIZE;
1213 
1214  uint64 needed_device_memory(const uint32 max_strings, const uint32 max_symbols) const
1215  {
1216  const uint32 max_words = util::divide_ri( max_symbols, SYMBOLS_PER_WORD ) + 2;
1217 
1218  return (max_strings+1) * sizeof(uint32) +
1219  max_words * sizeof(word_type);
1220  }
1221 
1222  void reserve(const uint32 max_strings, const uint32 max_symbols)
1223  {
1224  const uint32 max_words = util::divide_ri( max_symbols, SYMBOLS_PER_WORD ) + 2;
1225 
1226  alloc_storage( h_chunk_offsets, max_strings+1 );
1227  alloc_storage( d_chunk_offsets, max_strings+1 );
1228  alloc_storage( d_chunk_string, max_words );
1229  }
1230 
1232  const ConcatenatedStringSet<
1234  offsets_iterator> string_set,
1235  const uint32 chunk_begin,
1236  const uint32 chunk_end)
1237  {
1238  const uint32 chunk_size = chunk_end - chunk_begin;
1239 
1240  alloc_storage( h_chunk_offsets, chunk_size+1 );
1241  alloc_storage( d_chunk_offsets, chunk_size+1 );
1242 
1243  // find the words overlapped by the chunk
1244  const uint64 begin_index = string_set.offsets()[ chunk_begin ];
1245  const uint64 end_index = string_set.offsets()[ chunk_end ];
1246  const uint64 begin_word = (begin_index / SYMBOLS_PER_WORD);
1247  const uint64 end_word = (end_index + SYMBOLS_PER_WORD-1) / SYMBOLS_PER_WORD;
1248  const uint32 chunk_words = uint32( end_word - begin_word );
1249 
1250  const word_type* base_words = string_set.base_string().stream();
1251 
1252  alloc_storage( d_chunk_string, chunk_words );
1253 
1254  // copy them to the device
1255  thrust::copy(
1256  base_words + begin_word,
1257  base_words + begin_word + chunk_words,
1258  d_chunk_string.begin() );
1259 
1260  // build the host offsets
1261  uint32 chunk_symbols = uint32( begin_index % SYMBOLS_PER_WORD );
1262  h_chunk_offsets[0] = chunk_symbols;
1263  for (uint32 i = 0; i < chunk_size; ++i)
1264  {
1265  chunk_symbols += string_set[ chunk_begin + i ].size();
1266  h_chunk_offsets[i+1] = chunk_symbols;
1267  }
1268 
1269  // copy the offsets to the device
1270  thrust::copy(
1271  h_chunk_offsets.begin(),
1272  h_chunk_offsets.begin() + chunk_size+1,
1273  d_chunk_offsets.begin() );
1274 
1275  // finally assemble the device chunk string-set
1276  packed_stream_type d_packed_stream( word_pointer( nvbio::plain_view( d_chunk_string ) ) );
1277  return chunk_set_type(
1278  chunk_size,
1279  d_packed_stream.begin(),
1280  nvbio::plain_view( d_chunk_offsets ) );
1281  }
1282 
1283  thrust::host_vector<uint32> h_chunk_offsets;
1284  thrust::device_vector<word_type> d_chunk_string;
1285  thrust::device_vector<uint32> d_chunk_offsets;
1286 };
1287 
1290 template <uint32 SYMBOL_SIZE, bool BIG_ENDIAN, typename storage_type, typename offsets_iterator, typename system_tag>
1291 struct ChunkLoader<SYMBOL_SIZE,BIG_ENDIAN,storage_type,offsets_iterator,system_tag,system_tag>
1292 {
1293  typedef typename std::iterator_traits<offsets_iterator>::value_type index_type;
1294 
1295  typedef const ConcatenatedStringSet<
1297  offsets_iterator> string_set_type;
1298 
1300 
1302  const string_set_type string_set,
1303  const uint32 chunk_begin,
1304  const uint32 chunk_end)
1305  {
1306  // assemble the device chunk string-set
1307  return chunk_set_type(
1308  uint32( chunk_end - chunk_begin ),
1309  string_set.base_string(),
1310  string_set.offsets() + chunk_begin );
1311  }
1312 };
1313 
1316 template <uint32 SYMBOL_SIZE, uint32 BITS, uint32 DOLLAR_BITS, typename string_type, typename index_iterator, typename radix_iterator>
1318  const uint64 string_len,
1319  const string_type& string,
1320  const uint32 word_idx,
1321  const index_iterator indices_begin,
1322  const index_iterator indices_end,
1323  radix_iterator radices)
1324 {
1325  typedef typename std::iterator_traits<radix_iterator>::value_type word_type;
1326 
1328  indices_begin,
1329  indices_end,
1330  radices,
1332  string_len,
1333  string,
1334  word_idx ) );
1335 }
1336 
1339 template <uint32 SYMBOL_SIZE, uint32 N_BITS, uint32 DOLLAR_BITS>
1341 {
1343 
1345 
1349  template <typename suffix_iterator, typename string_type>
1350  void count(
1351  const uint32 n_suffixes,
1352  const suffix_iterator suffixes,
1353  const uint32 string_length,
1354  const string_type& string)
1355  {
1356  cuda::Timer timer;
1357 
1358  const uint32 n_buckets = 1u << (N_BITS);
1359 
1360  // initialize the temporary and output vectors
1361  alloc_storage( d_indices, n_suffixes * 2u );
1362  alloc_storage( d_radices, n_suffixes * 2u );
1363  alloc_storage( d_buckets, n_buckets );
1364 
1365  timer.start();
1366 
1367  // extract the first radix word from each of the suffixes
1368  flatten_string_suffixes<SYMBOL_SIZE, N_BITS,DOLLAR_BITS>(
1369  string_length,
1370  string,
1371  0u, // load the first word
1372  suffixes,
1373  suffixes + n_suffixes,
1374  d_radices.begin() );
1375 
1376  timer.stop();
1377  d_flatten_time += timer.seconds();
1378 
1379  timer.start();
1380 
1381  // sort the radices so as to make binning easy
1382  cuda::SortBuffers<word_type*> sort_buffers;
1383  cuda::SortEnactor sort_enactor;
1384 
1385  sort_buffers.selector = 0;
1386  sort_buffers.keys[0] = nvbio::device_view( d_radices );
1387  sort_buffers.keys[1] = nvbio::device_view( d_radices ) + n_suffixes;
1388  sort_enactor.sort( n_suffixes, sort_buffers, 0u, N_BITS );
1389  //thrust::sort( d_radices.begin(), d_radices.begin() + n_suffixes );
1390 
1391  timer.stop();
1392  d_count_sort_time += timer.seconds();
1393 
1394  // initialize the bucket counters
1395  thrust::fill( d_buckets.begin(), d_buckets.end(), 0u );
1396 
1397  // compute the number of effectively used buckets looking at the last non-empty one
1398  const uint32 n_used_buckets = d_radices[ sort_buffers.selector * n_suffixes + n_suffixes-1 ] + 1u;
1399 
1400  // find the end of each bin of values
1402  d_radices.begin() + sort_buffers.selector * n_suffixes,
1403  d_radices.begin() + sort_buffers.selector * n_suffixes + n_suffixes,
1404  thrust::make_counting_iterator<uint32>(0u),
1405  thrust::make_counting_iterator<uint32>(0u) + n_used_buckets,
1406  d_buckets.begin() );
1407 
1408  // compute the histogram by taking differences of the cumulative histogram
1409  thrust::adjacent_difference(
1410  d_buckets.begin(), d_buckets.begin() + n_used_buckets,
1411  d_buckets.begin());
1412  }
1413 
1417  template <typename suffix_iterator, typename string_type, typename bucketmap_iterator, typename output_iterator>
1419  const uint32 n_suffixes,
1420  const suffix_iterator suffixes,
1421  const uint64 string_length,
1422  const string_type& string,
1423  const uint32 bucket_begin,
1424  const uint32 bucket_end,
1425  const bucketmap_iterator bucketmap,
1426  output_iterator output_radices,
1427  output_iterator output_indices)
1428  {
1429  cuda::Timer timer;
1430 
1431  const uint32 n_buckets = 1u << N_BITS;
1432 
1433  // initialize the temporary and output vectors
1434  alloc_storage( d_indices, n_suffixes * 2u );
1435  alloc_storage( d_radices, n_suffixes * 2u );
1436  alloc_storage( d_buckets, n_buckets );
1437 
1438  timer.start();
1439 
1440  // extract the first radix word from each of the suffixes
1441  flatten_string_suffixes<SYMBOL_SIZE,N_BITS,DOLLAR_BITS>(
1442  string_length,
1443  string,
1444  0u, // load the first word
1445  suffixes,
1446  suffixes + n_suffixes,
1447  d_radices.begin() );
1448 
1449  timer.stop();
1450  d_flatten_time += timer.seconds();
1451 
1452  timer.start();
1453 
1454  // determine if a radix is in the given bucket range
1455  const priv::in_range_functor in_range = priv::in_range_functor( bucket_begin, bucket_end );
1456 
1457  // retain only suffixes whose radix is between the specified buckets
1458  const uint32 n_collected = cuda::copy_flagged(
1459  n_suffixes,
1460  thrust::make_zip_iterator( thrust::make_tuple( suffixes, d_radices.begin() ) ),
1461  thrust::make_transform_iterator( d_radices.begin(), in_range ),
1462  thrust::make_zip_iterator( thrust::make_tuple( d_indices.begin(), d_radices.begin() ) ) + n_suffixes,
1463  d_temp_storage );
1464 
1465  timer.stop();
1466  d_filter_time += timer.seconds();
1467 
1468  timer.start();
1469 
1470  // remap the collected radices
1471  thrust::gather(
1472  d_radices.begin() + n_suffixes,
1473  d_radices.begin() + n_suffixes + n_collected,
1474  bucketmap,
1475  d_radices.begin() + n_suffixes );
1476 
1477  timer.stop();
1478  d_remap_time += timer.seconds();
1479 
1480  timer.start();
1481 
1482  // sort the radices so as to make binning easy
1484  cuda::SortEnactor sort_enactor;
1485 
1486  sort_buffers.selector = 0;
1487  //#define SORT_BY_BUCKETS
1488  #if defined(SORT_BY_BUCKETS)
1489  sort_buffers.keys[0] = nvbio::device_view( d_radices ) + n_suffixes;
1490  sort_buffers.keys[1] = nvbio::device_view( d_radices );
1491  sort_buffers.values[0] = (uint64*)nvbio::device_view( d_indices ) + buffer_stride;
1492  sort_buffers.values[1] = (uint64*)nvbio::device_view( d_indices );
1493  sort_enactor.sort( n_collected, sort_buffers, 0u, N_BITS );
1494  #endif
1495 
1496  timer.stop();
1497  d_collect_sort_time += timer.seconds();
1498 
1499  //
1500  // copy all the indices inside the range to the output
1501  //
1502 
1503  //alloc_storage( output_suffixes, n_suffixes );
1504  //alloc_storage( output_radices, n_suffixes );
1505 
1506  timer.start();
1507 
1508  // the buffer selector had inverted semantics
1509  sort_buffers.selector = 1 - sort_buffers.selector;
1510 
1511  // and copy everything to the output
1512  thrust::copy(
1513  d_indices.begin() + sort_buffers.selector * n_suffixes,
1514  d_indices.begin() + sort_buffers.selector * n_suffixes + n_collected,
1515  output_indices );
1516 
1517  // and copy everything to the output
1518  thrust::copy(
1519  d_radices.begin() + sort_buffers.selector * n_suffixes,
1520  d_radices.begin() + sort_buffers.selector * n_suffixes + n_collected,
1521  output_radices );
1522 
1523  timer.stop();
1524  d_copy_time += timer.seconds();
1525 
1526  return n_collected;
1527  }
1528 
1529  thrust::device_vector<uint32> d_indices;
1530  thrust::device_vector<word_type> d_radices;
1531  thrust::device_vector<uint32> d_buckets;
1532  thrust::device_vector<uint8> d_temp_storage;
1540 };
1541 
1542 //
1543 // A host-side radix extractor context
1544 //
1545 template <typename string_set_type, uint32 SYMBOL_SIZE, uint32 DOLLAR_BITS, uint32 WORD_BITS>
1547 {
1548  HostStringSetRadices(const string_set_type string_set) : m_string_set( string_set ) {}
1549 
1552  uint32 num_words(const uint32 max_string_len) const
1553  {
1554  const uint32 SYMBOLS_PER_WORD = priv::symbols_per_word<SYMBOL_SIZE,WORD_BITS,DOLLAR_BITS>();
1555 
1556  return (max_string_len + SYMBOLS_PER_WORD-1) / SYMBOLS_PER_WORD;
1557  }
1558 
1561  uint64 needed_device_memory(const uint32 n_suffixes) const
1562  {
1563  return n_suffixes * sizeof(uint8) + // d_symbols
1564  n_suffixes * sizeof(uint32); // d_active_suffixes
1565  }
1566 
1569  void reserve(const uint32 n_suffixes, const uint32 block_size)
1570  {
1571  try
1572  {
1573  d_active_suffixes.resize( n_suffixes );
1574  d_symbols.resize( n_suffixes );
1575  h_symbols.resize( n_suffixes );
1576  h_active_suffixes.resize( n_suffixes );
1577  m_block.resize( n_suffixes * block_size );
1578  }
1579  catch (...)
1580  {
1581  log_error(stderr, "HostStringSetRadices::reserve() : allocation failed!\n");
1582  throw;
1583  }
1584  }
1585 
1588  void init(const uint32 n_suffixes, const uint2* _h_suffixes, const uint2* _d_suffixes)
1589  {
1590  m_suffixes = _h_suffixes;
1591  d_suffixes = thrust::device_ptr<const uint2>( _d_suffixes );
1592  }
1593 
1597  const uint32 n_indices,
1598  const uint32* d_indices,
1599  const uint32 word_block_begin,
1600  const uint32 word_block_end)
1601  {
1602  try
1603  {
1604  if (d_indices == NULL)
1605  {
1606  // extract the given radix word from each of the partially sorted suffixes on the host
1608  m_string_set,
1609  n_indices,
1610  word_block_begin,
1611  word_block_end,
1612  WORD_BITS,
1613  m_suffixes,
1614  &m_block[0],
1615  word_block_begin == 0 ? &h_symbols[0] : NULL ); // on the first iteration, load the BWT symbols too
1616  }
1617  else
1618  {
1619  // gather the list of active suffixes
1620  thrust::gather(
1621  thrust::device_ptr<const uint32>( d_indices ),
1622  thrust::device_ptr<const uint32>( d_indices ) + n_indices,
1623  d_suffixes,
1624  d_active_suffixes.begin() );
1625 
1626  // copy the list of active suffixes to the host
1627  thrust::copy(
1628  d_active_suffixes.begin(),
1629  d_active_suffixes.begin() + n_indices,
1630  h_active_suffixes.begin() );
1631 
1632  // extract the given radix word from each of the partially sorted suffixes on the host
1634  m_string_set,
1635  n_indices,
1636  word_block_begin,
1637  word_block_end,
1638  WORD_BITS,
1639  &h_active_suffixes[0],
1640  &m_block[0] );
1641  }
1642  }
1643  catch (...)
1644  {
1645  log_error(stderr, "HostStringSetRadices::init_slice() : exception caught!\n");
1646  throw;
1647  }
1648  }
1649 
1659  void extract(
1660  const uint32 n_indices,
1661  const uint32* d_indices,
1662  const uint32 word_idx,
1663  const uint32 word_block_begin,
1664  const uint32 word_block_end,
1665  uint32* d_radices) const
1666  {
1667  try
1668  {
1669  // and copy them to the device
1670  thrust::copy(
1671  m_block.begin() + n_indices * (word_idx - word_block_begin),
1672  m_block.begin() + n_indices * (word_idx - word_block_begin) + n_indices,
1673  thrust::device_ptr<uint32>( d_radices ) );
1674  }
1675  catch (...)
1676  {
1677  log_error(stderr, "HostStringSetRadices::extract() : exception caught!\n");
1678  throw;
1679  }
1680  }
1681 
1685  const uint32 begin,
1686  const uint32 end,
1687  uint8* h_bwt)
1688  {
1689  const int n_strings = int( end - begin );
1690 
1691  // fetch the BWT symbols for the given strings
1692  #pragma omp parallel for
1693  for (int i = 0; i < n_strings; ++i)
1694  {
1696  h_bwt[i] = bwt( i + begin );
1697  }
1698  }
1699 
1702  void bwt(
1703  const uint32 n_suffixes,
1704  const uint32* d_indices,
1705  uint8* h_bwt,
1706  uint8* d_bwt)
1707  {
1708  try
1709  {
1710  if (d_indices != NULL)
1711  {
1712  #if 0
1713  // fetch the BWT symbols for this block of suffixes
1714  #pragma omp parallel for
1715  for (int i = 0; i < n_suffixes; ++i)
1716  {
1718  h_symbols[i] = bwt( m_suffixes[i] );
1719  }
1720  #endif
1721 
1722  #if 0
1723  alloc_storage( m_block, n_suffixes );
1724 
1725  // re-purpose the radix-block storage
1726  uint32* h_indices = &m_block[0];
1727 
1728  // copy the sorted indices to the host
1729  thrust::copy(
1730  thrust::device_ptr<const uint32>( d_indices ),
1731  thrust::device_ptr<const uint32>( d_indices ) + n_suffixes,
1732  h_indices );
1733 
1734  // and compute the bwt of the block by gathering the symbols in suffix-sorted order
1735  thrust::gather(
1736  h_indices,
1737  h_indices + n_suffixes,
1738  h_symbols.begin(),
1739  h_bwt );
1740  #else
1741  alloc_storage( d_symbols, n_suffixes );
1742 
1743  // copy the symbols to the device
1744  thrust::copy(
1745  h_symbols.begin(),
1746  h_symbols.begin() + n_suffixes,
1747  d_symbols.begin() );
1748 
1749  // gather the symbols in proper order
1750  thrust::gather(
1751  thrust::device_ptr<const uint32>( d_indices ),
1752  thrust::device_ptr<const uint32>( d_indices ) + n_suffixes,
1753  d_symbols.begin(),
1754  thrust::device_ptr<uint8>( d_bwt ) );
1755 
1756  // and copy the sorted symbols back to the host
1757  thrust::copy(
1758  thrust::device_ptr<uint8>( d_bwt ),
1759  thrust::device_ptr<uint8>( d_bwt ) + n_suffixes,
1760  h_bwt );
1761  #endif
1762  }
1763  else
1764  {
1765  // fetch the BWT symbols for this block of suffixes
1766  #pragma omp parallel for
1767  for (int i = 0; i < n_suffixes; ++i)
1768  {
1770  h_bwt[i] = bwt( m_suffixes[i] );
1771  }
1772 
1773  // and copy the sorted symbols back to the device
1774  thrust::copy(
1775  h_bwt,
1776  h_bwt + n_suffixes,
1777  thrust::device_ptr<uint8>( d_bwt ) );
1778  }
1779  }
1780  catch (...)
1781  {
1782  log_error(stderr, "HostStringSetRadices::bwt() : exception caught!\n");
1783  throw;
1784  }
1785  }
1786 
1790  {
1791  return
1792  d_active_suffixes.size() * sizeof(uint2) +
1793  d_symbols.size() * sizeof(uint8);
1794  }
1795 
1799  {
1800  return
1801  m_block.size() * sizeof(uint2) +
1802  h_active_suffixes.size() * sizeof(uint2) +
1803  h_symbols.size() * sizeof(uint8);
1804  }
1805 
1806  string_set_type m_string_set;
1807  const uint2* m_suffixes;
1808  thrust::device_ptr<const uint2> d_suffixes;
1809  thrust::device_vector<uint2> d_active_suffixes;
1810  thrust::device_vector<uint8> d_symbols;
1811  thrust::host_vector<uint2> h_active_suffixes;
1812  thrust::host_vector<uint8> h_symbols;
1813  thrust::host_vector<uint32> m_block;
1814 };
1815 
1816 //
1817 // A host-side radix extractor context
1818 //
1819 template <typename string_set_type, uint32 SYMBOL_SIZE, uint32 DOLLAR_BITS, uint32 WORD_BITS>
1821 {
1823  DeviceStringSetRadices(const string_set_type string_set) : m_string_set( string_set ) {}
1824 
1827  uint32 num_words(const uint32 max_string_len) const
1828  {
1829  const uint32 SYMBOLS_PER_WORD = priv::symbols_per_word<SYMBOL_SIZE,WORD_BITS,DOLLAR_BITS>();
1830 
1831  return (max_string_len + SYMBOLS_PER_WORD-1) / SYMBOLS_PER_WORD;
1832  }
1833 
1836  uint64 needed_device_memory(const uint32 n_suffixes) const
1837  {
1838  return n_suffixes; // d_symbols
1839  }
1840 
1843  void reserve(const uint32 n_suffixes, const uint32 slice_size)
1844  {
1845  d_symbols.resize( n_suffixes );
1846  }
1847 
1850  void set(const string_set_type string_set) { m_string_set = string_set; }
1851 
1854  void init(const uint32 n_suffixes, const uint2* _h_suffixes, const uint2* _d_suffixes)
1855  {
1856  d_suffixes = thrust::device_ptr<const uint2>( _d_suffixes );
1857  }
1858 
1862  const uint32 n_indices,
1863  const uint32* d_indices,
1864  const uint32 word_block_begin,
1865  const uint32 word_block_end) {}
1866 
1876  void extract(
1877  const uint32 n_indices,
1878  const uint32* d_indices,
1879  const uint32 word_idx,
1880  const uint32 word_block_begin,
1881  const uint32 word_block_end,
1882  uint32* d_radices) const
1883  {
1884  // extract the given radix word from each of the partially sorted suffixes in a device temp buffer
1886 
1887  if (d_indices == NULL)
1888  {
1889  thrust::copy(
1891  thrust::make_transform_iterator( d_suffixes, word_functor ) + n_indices,
1892  thrust::device_ptr<uint32>( d_radices ) );
1893  }
1894  else
1895  {
1896  thrust::copy(
1897  thrust::make_transform_iterator( thrust::make_permutation_iterator( d_suffixes, thrust::device_ptr<const uint32>( d_indices ) ), word_functor ),
1898  thrust::make_transform_iterator( thrust::make_permutation_iterator( d_suffixes, thrust::device_ptr<const uint32>( d_indices ) ), word_functor ) + n_indices,
1899  thrust::device_ptr<uint32>( d_radices ) );
1900  }
1901  }
1902 
1906  const uint32 begin,
1907  const uint32 end,
1908  uint8* h_bwt)
1909  {
1910  const int n_strings = end - begin;
1911 
1912  alloc_storage( d_symbols, n_strings );
1913 
1914  // fetch the BWT symbols for the given strings
1916  thrust::make_counting_iterator<uint32>(begin),
1917  thrust::make_counting_iterator<uint32>(end),
1918  d_symbols.begin(),
1920 
1921  // and copy the result to the host
1922  thrust::copy(
1923  d_symbols.begin(),
1924  d_symbols.begin() + n_strings,
1925  h_bwt );
1926  }
1927 
1930  void bwt(
1931  const uint32 n_suffixes,
1932  const uint32* d_indices,
1933  uint8* h_bwt,
1934  uint8* d_bwt)
1935  {
1936  if (d_indices != NULL)
1937  {
1938  alloc_storage( d_symbols, n_suffixes );
1939 
1940  // fetch the BWT symbols for this block of suffixes
1942  d_suffixes,
1943  d_suffixes + n_suffixes,
1944  d_symbols.begin(),
1946 
1947  // and compute the bwt of the block by gathering the symbols in suffix-sorted order
1948  thrust::gather(
1949  thrust::device_ptr<const uint32>( d_indices ),
1950  thrust::device_ptr<const uint32>( d_indices ) + n_suffixes,
1951  d_symbols.begin(),
1952  thrust::device_ptr<uint8>( d_bwt ) );
1953  }
1954  else
1955  {
1956  // fetch the BWT symbols for this block of suffixes
1958  d_suffixes,
1959  d_suffixes + n_suffixes,
1960  thrust::device_ptr<uint8>( d_bwt ),
1962  }
1963 
1964  // and copy the result to the host
1965  thrust::copy(
1966  thrust::device_ptr<uint8>( d_bwt ),
1967  thrust::device_ptr<uint8>( d_bwt ) + n_suffixes,
1968  h_bwt );
1969  }
1970 
1974  {
1975  return d_symbols.size() * sizeof(uint8);
1976  }
1977 
1981  {
1982  return 0u;
1983  }
1984 
1985  string_set_type m_string_set;
1986  thrust::device_ptr<const uint2> d_suffixes;
1987  thrust::device_vector<uint8> d_symbols;
1988 };
1989 
1993 {
1997  offset(0),
1998  n_dollars(0) {}
1999 
2002  uint32 extract(
2003  const uint32 n_suffixes,
2004  const uint8* h_bwt,
2005  const uint8* d_bwt,
2006  const uint2* h_suffixes,
2007  const uint2* d_suffixes,
2008  const uint32* d_indices);
2009 
2012 
2013  thrust::device_vector<uint64> d_dollar_ranks;
2014  thrust::device_vector<uint32> d_dollar_indices;
2015  thrust::device_vector<uint64> d_dollars;
2016  thrust::host_vector<uint64> h_dollar_ranks;
2017  thrust::host_vector<uint64> h_dollars;
2018  thrust::device_vector<uint8> d_temp_storage;
2019 };
2020 
2021 // ------------------------------------------------------------------------------------------------------------- //
2022 // the following functions implement device_copy() and device_scatter() - special-purpose functions to copy
2023 // and scatter a set of symbols to a packed stream.
2024 // ------------------------------------------------------------------------------------------------------------- //
2025 
2028 template <typename input_iterator, typename output_iterator, typename index_type>
2030  const uint32 n,
2031  const input_iterator input,
2032  output_iterator output,
2033  const index_type offset)
2034 {
2035  const uint32 thread_id = threadIdx.x + blockIdx.x*blockDim.x;
2036 
2037  if (thread_id < n)
2038  output[offset + thread_id] = input[thread_id];
2039 }
2040 
2043 template <typename input_iterator, typename storage_type, uint32 SYMBOL_SIZE, bool BIG_ENDIAN, typename index_type>
2045  const uint32 n,
2046  const input_iterator input,
2048  const index_type offset)
2049 {
2050  const uint32 thread_id = threadIdx.x + blockIdx.x*blockDim.x;
2051 
2052  //
2053  // care must be used to avoid write-conflicts, hence we assign all symbols belonging
2054  // to the same output word to a single thread
2055  //
2057  const uint32 SYMBOLS_PER_WORD = (8u*sizeof(word_type))/SYMBOL_SIZE;
2058 
2059  const uint32 word_offset = uint32( (offset + output.index()) & (SYMBOLS_PER_WORD-1) );
2060  const uint32 elem_begin = thread_id ? (thread_id+0) * SYMBOLS_PER_WORD - word_offset : 0u;
2061  const uint32 elem_end = nvbio::min( (thread_id+1) * SYMBOLS_PER_WORD - word_offset, n );
2062 
2063  if (elem_begin < n)
2064  {
2065  for (uint32 i = elem_begin; i < elem_end; ++i)
2066  output[offset+i] = input[i];
2067  }
2068 }
2069 
2072 template <typename input_iterator, typename output_iterator, typename index_type>
2074 {
2077  static void copy(
2078  const uint32 n,
2079  const input_iterator input,
2080  const output_iterator output,
2081  const index_type offset)
2082  {
2083  const uint32 batch_size = cuda::max_grid_size();
2084  for (uint32 batch_begin = 0; batch_begin < n; batch_begin += batch_size)
2085  {
2086  const uint32 batch_end = nvbio::min( batch_begin + batch_size, n );
2087 
2088  const uint32 blockdim = 128;
2089  const uint32 n_blocks = util::divide_ri( batch_end - batch_begin, blockdim );
2090  simple_device_copy_kernel<<<n_blocks,blockdim>>>( n, input, output, offset );
2091  }
2092  }
2093 };
2094 
2097 template <typename input_iterator, typename storage_type, uint32 SYMBOL_SIZE, bool BIG_ENDIAN, typename index_type>
2099  input_iterator,
2100  PackedStream<storage_type,uint8,SYMBOL_SIZE,BIG_ENDIAN,index_type>,
2101  index_type>
2102 {
2104 
2107  static void copy(
2108  const uint32 n,
2109  const input_iterator input,
2110  const output_iterator output,
2111  const index_type offset)
2112  {
2114  const uint32 SYMBOLS_PER_WORD = (8u*sizeof(word_type))/SYMBOL_SIZE;
2115 
2116  const uint32 batch_size = cuda::max_grid_size();
2117  for (uint32 batch_begin = 0; batch_begin < n; batch_begin += batch_size)
2118  {
2119  const uint32 batch_end = nvbio::min( batch_begin + batch_size, n );
2120 
2121  const uint32 blockdim = 128;
2122  const uint32 n_words = util::divide_ri( batch_end - batch_begin, SYMBOLS_PER_WORD ) + 1u;
2123  const uint32 n_blocks = util::divide_ri( n_words, blockdim );
2124 
2125  packed_device_copy_kernel<<<n_blocks,blockdim>>>( batch_end - batch_begin, input, output, offset + batch_begin );
2126  }
2127  }
2128 };
2129 
2132 template <typename input_iterator, typename output_iterator, typename index_type>
2134  const uint32 n,
2135  const input_iterator input,
2136  const output_iterator output,
2137  const index_type offset)
2138 {
2140 }
2141 
2146 template <typename input_iterator, typename slot_iterator, typename range_iterator, typename storage_type, uint32 SYMBOL_SIZE, bool BIG_ENDIAN, typename index_type>
2147 __global__ void device_scatter_kernel(
2148  const uint32 begin,
2149  const uint32 end,
2150  const range_iterator ranges,
2151  const input_iterator input,
2152  const slot_iterator slots,
2154 {
2155  const uint32 thread_id = threadIdx.x + blockIdx.x*blockDim.x;
2156  const uint32 idx = thread_id + begin;
2157 
2158  if (idx >= end)
2159  return;
2160 
2161  //
2162  // care must be used to avoid write-conflicts, hence we assign all symbols belonging
2163  // to the same output word to a single thread
2164  //
2165  const uint32 elem_begin = idx ? ranges[ idx-1 ] : 0u;
2166  const uint32 elem_end = ranges[ idx ];
2167 
2168  for (uint32 i = elem_begin; i < elem_end; ++i)
2169  {
2170  const uint32 slot = slots[i];
2171  output[ slot ] = input[i];
2172  }
2173 }
2174 
2175 
2178 template <typename input_iterator, typename slot_iterator, typename output_iterator>
2180 {
2181  static void enact(
2182  const uint32 n,
2183  const input_iterator input,
2184  const slot_iterator slots,
2185  output_iterator output)
2186  {
2187  thrust::scatter(
2188  input,
2189  input + n,
2190  slots,
2191  output );
2192  }
2193 };
2194 
2197 template <typename input_iterator, typename slot_iterator, typename storage_type, uint32 SYMBOL_SIZE, bool BIG_ENDIAN, typename index_type>
2199  input_iterator,
2200  slot_iterator,
2201  PackedStream<storage_type,uint8,SYMBOL_SIZE,BIG_ENDIAN,index_type> >
2202 {
2204 
2205  static void enact(
2206  const uint32 n,
2207  const input_iterator input,
2208  const slot_iterator slots,
2209  output_iterator output)
2210  {
2211  // find out a set of ranges of input symbols covering distinct words in the output: this is done
2212  // looking at the words covered by each of the symbols, and reducing together all symbols falling
2213  // in the same word.
2214  thrust::device_vector<uint32> d_ranges( n );
2215  thrust::device_vector<uint32> d_keys( n );
2216 
2218  const uint32 SYMBOLS_PER_WORD = (8u*sizeof(word_type))/SYMBOL_SIZE;
2219 
2220  const uint32 n_ranges = uint32( thrust::reduce_by_key(
2221  thrust::make_transform_iterator( slots, add_divide_functor( output.index(), SYMBOLS_PER_WORD ) ),
2222  thrust::make_transform_iterator( slots + n, add_divide_functor( output.index(), SYMBOLS_PER_WORD ) ),
2223  thrust::make_counting_iterator<uint32>(1u),
2224  d_keys.begin(),
2225  d_ranges.begin(),
2226  thrust::equal_to<uint32>(),
2227  thrust::maximum<uint32>() ).first - d_keys.begin() );
2228 
2229  const uint32 batch_size = cuda::max_grid_size();
2230  for (uint32 batch_begin = 0; batch_begin < n_ranges; batch_begin += batch_size)
2231  {
2232  const uint32 batch_end = nvbio::min( batch_begin + batch_size, n_ranges );
2233 
2234  // at this point we can scatter the identified ranges
2235  const uint32 blockdim = 128;
2236  const uint32 n_blocks = util::divide_ri( batch_end - batch_begin, blockdim );
2237 
2238  device_scatter_kernel<<<n_blocks,blockdim>>>(
2239  batch_begin,
2240  batch_end,
2241  d_ranges.begin(),
2242  input,
2243  slots,
2244  output );
2245  }
2246  }
2247 };
2248 
2251 template <typename input_iterator, typename slot_iterator, typename output_iterator>
2253  const uint32 n,
2254  const input_iterator input,
2255  const slot_iterator slots,
2256  output_iterator output)
2257 {
2259  n,
2260  input,
2261  slots,
2262  output );
2263 }
2264 
2265 // ------------------------------------------------------------------------------------------------------------- //
2266 
2269 void pack_flags(
2270  const uint32 n,
2271  const uint8* flags,
2272  uint32* comp_flags);
2273 
2276 void build_head_flags(
2277  const uint32 n,
2278  const uint32* keys,
2279  uint8* flags);
2280 
2283 void build_head_flags(
2284  const uint32 n,
2285  const uint64* keys,
2286  uint8* flags);
2287 
2288 } // namespace priv
2289 } // namespace nvbio