30 #include <cub/cub.cuh>
31 #include <mgpuhost.cuh>
32 #include <moderngpu.cuh>
42 #include <thrust/merge.h>
59 if (
max_block_suffixes < _max_block_suffixes) cudaHostRegister( &
h_SA[0], _max_block_suffixes *
sizeof(
uint32), cudaHostRegisterPortable );
64 #if defined(QUICK_CHECK_REPORT) || defined(CHECK_SORTING)
74 template <u
int32 SYMBOL_SIZE,
bool BIG_ENDIAN,
typename storage_type,
typename offsets_iterator>
77 string_sorter( mgpu_ctxt ),
85 insert_dollars_time = 0.0f;
90 n_processed_strings = 0u;
91 n_processed_suffixes = 0u;
96 template <u
int32 SYMBOL_SIZE,
bool BIG_ENDIAN,
typename storage_type,
typename offsets_iterator>
99 const size_t d_bytes =
100 string_sorter.needed_device_memory( _max_block_suffixes )
101 + string_set_handler.needed_device_memory( max_block_suffixes )
102 + suffixes.needed_device_memory( _max_block_strings, _max_block_suffixes )
103 + chunk_loader.needed_device_memory( _max_block_strings, _max_block_suffixes )
104 + _max_block_suffixes *
sizeof(uint2)
105 + _max_block_suffixes *
sizeof(
uint8)
106 + _max_block_suffixes *
sizeof(
uint8)
107 + _max_block_strings *
sizeof(
uint32)
108 + _max_block_strings *
sizeof(
uint32);
115 template <u
int32 SYMBOL_SIZE,
bool BIG_ENDIAN,
typename storage_type,
typename offsets_iterator>
118 max_block_suffixes = _max_block_suffixes;
119 max_block_strings = _max_block_strings;
121 const size_t d_bytes =
122 string_sorter.needed_device_memory( max_block_suffixes )
123 + string_set_handler.needed_device_memory( max_block_suffixes )
124 + suffixes.needed_device_memory( max_block_strings, max_block_suffixes )
125 + chunk_loader.needed_device_memory( _max_block_strings, _max_block_suffixes )
126 + max_block_suffixes *
sizeof(uint2)
127 + max_block_suffixes *
sizeof(
uint8)
128 + max_block_suffixes *
sizeof(
uint8)
129 + max_block_strings *
sizeof(
uint32)
130 + max_block_strings *
sizeof(
uint32);
132 log_verbose(stderr,
" allocating device sorting storage (%.1f GB)\n",
133 float( d_bytes ) /
float(1024*1024*1024) );
143 chunk_loader.reserve( max_block_strings, max_block_suffixes );
145 suffixes.reserve( max_block_strings, max_block_suffixes );
147 string_set_handler.reserve( max_block_suffixes, SORTING_SLICE_SIZE );
148 string_sorter.reserve( max_block_suffixes );
150 const size_t h_bytes =
151 max_block_suffixes *
sizeof(
uint64) * 2 +
152 max_block_suffixes *
sizeof(
uint32) +
153 max_block_strings *
sizeof(
uint32) +
154 max_block_suffixes *
sizeof(
uint8) +
155 max_block_strings *
sizeof(
uint32) +
156 max_block_strings *
sizeof(
uint32) +
157 max_block_strings *
sizeof(
uint64);
159 log_verbose(stderr,
" allocating host sorting storage (%.1f GB)\n",
160 float( h_bytes ) /
float(1024*1024*1024) );
165 block.reserve( max_block_strings, max_block_suffixes );
170 template <u
int32 SYMBOL_SIZE,
bool BIG_ENDIAN,
typename storage_type,
typename offsets_iterator>
179 sort_block( block_begin, block_end, string_set, block );
193 template <u
int32 SYMBOL_SIZE,
bool BIG_ENDIAN,
typename storage_type,
typename offsets_iterator>
203 n_suffixes_ext = BWT_ext.
size();
204 n_strings_ext = BWT_ext_dollars.
size();
239 #if defined(CHECK_COPY)
241 uint64 occ[SYMBOL_COUNT] = { 0 };
247 const uint32 cb = block.
h_BWT[i] & (SYMBOL_COUNT-1);
248 const uint32 ce = BWT_ext[i];
251 log_error(stderr,
"at %u, expected %u, got %u!\n", i, cb, ce);
256 for (
uint8 q = 0; q < SYMBOL_COUNT; ++q)
261 log_error(stderr,
"ranks mismatch at c[%u], i[%llu]:p[%llu]: expected %llu, got %llu!\n",
262 uint32(q), i, occ[q], r );
270 #if defined(CHECK_SORTING)
277 for (
uint32 i = 0; i < n_block_suffixes; ++i)
278 TSA[i] = suffix_localizer( block.
h_SA[i] );
289 template <u
int32 SYMBOL_SIZE,
bool BIG_ENDIAN,
typename storage_type,
typename offsets_iterator>
296 block.
reserve( max_block_strings, max_block_suffixes );
298 const uint32 n_block_strings = block_end - block_begin;
300 uint32 n_block_suffixes = 0u;
301 for (
uint32 i = block_begin; i < block_end; ++i)
308 log_debug(stderr,
" load device chunk (%u)\n", n_block_suffixes);
313 const chunk_set_type chunk_set = chunk_loader.load( string_set, block_begin, block_end );
322 const uint32 SYMBOLS_PER_RADIXWORD = priv::symbols_per_word<SYMBOL_SIZE,RADIX_BITS,DOLLAR_BITS>();
324 log_debug(stderr,
" set suffix flattener\n");
329 suffixes.set( chunk_set );
332 if (suffixes.n_suffixes != n_block_suffixes)
334 log_error(stderr,
" unexpected number of suffixes! (%u)\n", suffixes.n_suffixes);
343 #if defined(HOST_STRING_IDS)
345 suffixes.string_ids.begin(),
346 suffixes.string_ids.begin() + n_block_suffixes,
351 suffixes.cum_lengths.begin(),
352 suffixes.cum_lengths.begin() + n_block_strings,
356 log_debug(stderr,
" localize suffixes\n");
366 nvbio::transform<device_tag>(
368 thrust::make_counting_iterator<uint32>(0),
377 thrust::device_ptr<uint8> d_unsorted_bwt = thrust::device_ptr<uint8>(
raw_pointer( d_temp_storage ) );
378 thrust::device_ptr<uint8> d_bwt = thrust::device_ptr<uint8>(
raw_pointer( d_BWT_block ) );
380 log_debug(stderr,
" compression sort\n");
392 string_set_handler.set( chunk_set );
402 thrust::make_counting_iterator<uint32>(0u),
406 SORTING_SLICE_SIZE );
413 nvbio::transform<device_tag>(
422 d_indices + n_block_suffixes,
435 d_indices + n_block_suffixes,
436 block.
h_SA.begin() );
441 d_bwt + n_block_suffixes,
442 block.
h_BWT.begin() );
447 thrust::make_zip_iterator( thrust::make_tuple(
448 thrust::make_counting_iterator<uint32>(0),
451 thrust::make_zip_iterator( thrust::make_tuple(
452 d_dollar_off.begin(),
453 d_dollar_id.begin() ) ),
456 if (n_dollars != n_block_strings)
458 log_error(stderr,
"mismatching number of dollars! expected %u, got %u\n", n_block_strings, n_dollars);
464 d_dollar_off.begin(),
465 d_dollar_off.begin() + n_block_strings,
471 d_dollar_id.begin() + n_block_strings,
476 log_verbose(stderr,
" sort : %.1f M suffixes/s (sort: %.1f%%, copy: %.1f%%)\n",
477 (1.0e-6f * (n_processed_suffixes + n_block_suffixes)) / (sort_time + copy_time),
478 100.0f * sort_time / (sort_time + copy_time),
479 100.0f * copy_time / (sort_time + copy_time));
484 template <u
int32 SYMBOL_SIZE,
bool BIG_ENDIAN,
typename storage_type,
typename offsets_iterator>
488 const string_set_type string_set,
494 typedef typename string_set_type::string_type string_type;
528 C[0] = n_strings_ext;
529 for (
uint32 i = 1; i <= SYMBOL_COUNT; ++i)
531 C[i] = C[i-1] + freqs[i-1];
534 if ((255u % SYMBOL_COUNT) == i-1)
535 C[i] -= n_strings_ext;
541 const uint32 DOLLAR_SYMBOL = 255u & (SYMBOL_COUNT-1);
544 #pragma omp parallel for
545 for (
int32 q = 0; q <
int32( n_block_strings ); ++q)
547 const string_type
string = string_set[block_begin + q];
552 uint64 i = forward ? n_strings_ext : 0u;
556 g[suffix_off + len] = i;
559 for (
int32 k = len-1u; k >= 0; --k)
561 const uint8 c =
string[k];
563 const uint64 i_dollars = (c == DOLLAR_SYMBOL) ?
564 BWT_ext_dollars.
rank(i-1) : 0u;
567 const uint64 r = C[c] + BWT_ext.
rank( i-1, c );
568 const uint64 j = r - i_dollars;
569 assert( i_dollars <= r );
570 assert( j <= n_suffixes_ext );
573 g[suffix_off + k] = j;
576 if ((q == 243303 && k >= 56) ||
577 (q == 2163961 && k >= 56))
580 log_verbose(stderr,
" q[%8u:%2u] : LF[%9llu,%u] = %9llu (r[%9llu], $[%9llu]\n",
581 q, k, i,
uint32(c), j, r, i_dollars,
592 #pragma omp parallel for
593 for (
int32 i = 0; i <
int32( n_block_suffixes ); ++i)
594 g_sorted[i] = g[block.
h_SA[i]];
599 (1.0e-6f * (n_processed_suffixes + n_block_suffixes)) / rank_time);
601 #if defined(CHECK_SORTING)
605 const priv::localize_suffix_functor suffix_localizer(
610 typedef Suffix<input_string_type,uint32> suffix_type;
612 for (
uint32 i = 0; i < n_block_suffixes; ++i)
615 log_visible(stderr,
"\r check sorting... %6.2f%% ", 100.0f *
float(i)/
float(n_block_suffixes));
619 const uint2 suffix_i = suffix_localizer( sa );
621 if (g_sorted[i] < n_suffixes_ext)
623 const uint2 suffix_n = TSA[ g_sorted[i] ];
630 log_error(stderr,
"\ninsertion[%u -> %llu] : (%u,%u) > (%u,%u)\n",
632 suffix_i.y, suffix_i.x,
633 suffix_n.y, suffix_n.x );
635 const suffix_type string_i =
make_suffix( string_set[suffix_i.y], suffix_i.x );
636 const suffix_type string_n =
make_suffix( string_set[suffix_n.y], suffix_n.x );
641 const uint8 c_i = string_i[j];
649 const uint8 c_n = string_n[j];
659 const uint2 suffix_p = TSA[ g_sorted[i]-1 ];
666 log_error(stderr,
"\ninsertion[%u-1 -> %llu] : (%u,%u) < (%u, %u)\n",
669 suffix_i.y, suffix_i.x,
670 suffix_p.y, suffix_p.x );
672 const suffix_type string_i =
make_suffix( string_set[suffix_i.y], suffix_i.x );
673 const suffix_type string_p =
make_suffix( string_set[suffix_p.y], suffix_p.x );
678 const uint8 c_i = string_i[j];
686 const uint8 c_p = string_p[j];
695 log_visible(stderr,
"\r check sorting... done \n");
699 #if defined(QUICK_CHECK)
702 if (is_sorted<host_tag>( n_block_suffixes, &g_sorted[0] ) ==
false)
704 log_error(stderr,
" BWTE: suffix ranks not in sorted order!\n");
705 #if defined(QUICK_CHECK_REPORT)
706 for (
uint32 i = 0; i < n_block_suffixes-1; ++i)
708 if (g_sorted[i] > g_sorted[i+1])
710 log_error(stderr,
" g_s[%u->%u] = %llu > g_s[%u->%u] = %llu\n",
711 i, block.
h_SA[i], g_sorted[i],
712 i+1, block.
h_SA[i+1], g_sorted[i+1] );
715 const priv::localize_suffix_functor suffix_localizer(
719 const uint2 suffix1 = suffix_localizer( block.
h_SA[i] );
720 const uint2 suffix2 = suffix_localizer( block.
h_SA[i+1] );
723 log_error(stderr,
" (%02u,%8u): ", suffix1.x, suffix1.y);
725 const string_type
string = string_set[block_begin + suffix1.y];
730 log_error(stderr,
" (%02u,%8u): ", suffix2.x, suffix2.y);
732 const string_type
string = string_set[block_begin + suffix2.y];
748 template <u
int32 SYMBOL_SIZE,
bool BIG_ENDIAN,
typename storage_type,
typename offsets_iterator>
749 void BWTEContext<SYMBOL_SIZE,BIG_ENDIAN,storage_type,offsets_iterator>::insert_block(
751 PagedText<SYMBOL_SIZE,BIG_ENDIAN>& BWT_ext,
752 SparseSymbolSet& BWT_ext_dollars)
754 const uint32 n_block_strings = block.n_strings;
755 const uint32 n_block_suffixes = block.n_suffixes;
757 #if defined(CHECK_INSERTION)
760 #pragma omp parallel for
761 for (
int32 i = 0; i <
int32( n_suffixes_ext ); ++i)
762 BWT_ext_copy[i] = BWT_ext[i];
767 ScopedTimer<float> timer( &insert_time );
770 BWT_ext.insert( n_block_suffixes, &g_sorted[0], &block.h_BWT[0] );
774 ScopedTimer<float> timer( &insert_dollars_time );
778 block.h_dollar_off.begin(),
779 block.h_dollar_off.begin() + n_block_strings,
781 block.h_dollar_pos.begin() );
784 BWT_ext_dollars.insert(
793 log_verbose(stderr,
" insert : %.1f M suffixes/s (symbols: %.1f%%, dollars: %.1f%%)\n",
794 (1.0e-6f * (n_processed_suffixes + n_block_suffixes)) / (insert_time + insert_dollars_time),
795 100.0f * insert_time / (insert_time + insert_dollars_time),
796 100.0f * insert_dollars_time / (insert_time + insert_dollars_time));
798 #if defined(CHECK_INSERTION)
800 uint64 occ[SYMBOL_COUNT] = { 0 };
805 for (
uint32 j = 0; j < n_block_suffixes; ++j)
808 log_visible(stderr,
"\r check insertions... %6.2f%% ", 100.0f *
float(j)/
float(n_block_suffixes));
810 const uint64 g_j = g_sorted[j];
814 if (BWT_ext[o] != BWT_ext_copy[p])
816 log_error(stderr,
"insertion mismatch at o[%llu]:p[%llu]: expected %u, got %u\n", o, p,
822 for (
uint8 q = 0; q < SYMBOL_COUNT; ++q)
824 const uint64 r = BWT_ext.rank( o, q );
827 log_error(stderr,
"ranks mismatch at c[%u], o[%llu]:p[%llu]: expected %llu, got %llu\n",
828 uint32(q), o, p, occ[q], r );
837 const uint32 cc = block.h_BWT[j] % SYMBOL_COUNT;
838 n_dollars += (block.h_BWT[j] == 255u) ? 1u : 0u;
840 if (BWT_ext[o] != cc)
842 const uint32 ce = BWT_ext[o];
843 log_error(stderr,
"insertion mismatch at o[%llu]:j[%u]:g[%llu]: expected %u, got %u (page[%u:%llu:%p])\n",
845 cc, ce, BWT_ext.find_page(o), BWT_ext.m_offsets[ BWT_ext.find_page(o) ], BWT_ext.get_page( BWT_ext.find_page(o) ));
849 for (
uint8 q = 0; q < SYMBOL_COUNT; ++q)
851 const uint64 r = BWT_ext.rank( o, q );
854 log_error(stderr,
"ranks mismatch at c[%u], o[%llu]:j[%llu]: expected %llu, got %llu\n",
855 uint32(q), o, p, occ[q], r );
861 while (p < n_suffixes_ext)
863 if (BWT_ext[o] != BWT_ext_copy[p])
865 log_error(stderr,
"insertion mismatch at o[%llu]:p[%llu]: expected %u, got %u\n", o, p,
870 for (
uint8 q = 0; q < SYMBOL_COUNT; ++q)
872 const uint64 r = BWT_ext.rank( o, q );
875 log_error(stderr,
"ranks mismatch at c[%u], o[%llu]:p[%llu]: expected %llu, got %llu\n",
876 uint32(q), o, p, occ[q], r );
895 template <u
int32 SYMBOL_SIZE,
bool BIG_ENDIAN,
typename storage_type,
typename offsets_iterator>
898 PackedStream<storage_type,
uint8,SYMBOL_SIZE,BIG_ENDIAN,
typename std::iterator_traits<offsets_iterator>::value_type>,
899 offsets_iterator> string_set,
913 for (
uint32 i = 0; i < N; ++i)
915 const input_string_type
string = string_set[i];
918 bwt_size += len + 1u;
922 log_verbose(stderr,
" allocating host BWT storage (%.1f GB)\n",
926 BWT_ext_dollars.
reserve( bwt_size, N );
930 cudaGetDevice( ¤t_device );
933 const uint32 max_block_suffixes = 256*1024*1024;
934 const uint32 max_block_strings = 16*1024*1024;
936 bwte_context.
reserve( max_block_strings, max_block_suffixes );
938 const bool forward =
true;
941 for (
uint32 block_begin = forward ? 0 : N,
942 block_end = forward ? 0 : N;
943 forward ? (block_begin < N) :
947 uint32 n_block_suffixes = 0;
952 for (
uint32 i = block_begin; block_begin < N; ++i)
954 const input_string_type
string = string_set[i];
959 if (n_block_suffixes + string_len + 1u > max_block_suffixes ||
960 i - block_begin > max_block_strings)
963 n_block_suffixes += string_len + 1u;
970 for (
int32 i = block_end-1; i >= 0; --i)
972 const input_string_type
string = string_set[i];
977 if (n_block_suffixes + string_len + 1u > max_block_suffixes ||
978 block_end - i > max_block_strings)
981 n_block_suffixes += string_len + 1u;
986 log_info(stderr,
" block [%u, %u] (%u suffixes)\n", block_begin, block_end, n_block_suffixes);
997 block_begin = block_end;
999 block_end = block_begin;