35 #include <thrust/device_vector.h>
36 #include <thrust/transform_scan.h>
37 #include <thrust/binary_search.h>
38 #include <thrust/iterator/constant_iterator.h>
39 #include <thrust/iterator/counting_iterator.h>
40 #include <mgpuhost.cuh>
41 #include <moderngpu.cuh>
42 #include <cub/cub.cuh>
// Fragment of a templated sort(...) member-function declaration: takes the
// input string (by const value) and an output iterator receiving the sorted
// suffix indices. The return type, method name and any preceding parameters
// fall outside this chunk -- see the matching definition further below.
72 template <
typename string_type,
typename output_iterator>
75 const string_type
string,
76 output_iterator d_suffixes);
// Fragment of a capacity-reservation method: grows the temporary device
// vectors only when the requested size n exceeds the current capacity
// (the enclosing signature and braces are not visible in this chunk).
82 if (n > d_active_slots.size())
86 d_inv_keys.resize( n );
// most work vectors carry a +4 tail, presumably padding for the uint4
// vectorized accesses used by the kernels below -- TODO confirm
87 d_active_slots.resize( n + 4 );
88 d_sort_keys.resize( n + 4 );
89 d_sort_indices.resize( n + 4 );
// flag (uint8) vectors carry a larger +16 tail; the extract_segments path
// also reads/writes flags at offsets up to +4 past the logical end
90 d_partial_flags.resize( n + 16 );
91 d_temp_flags.resize( n + 16 );
92 d_segment_flags.resize( n + 16 );
93 d_segment_keys.resize( n + 4 );
94 d_segment_heads.resize( n + 4 );
// NOTE(review): operator-precedence bug -- integer 127/128 == 0, so this
// resizes to n+4 entries instead of the intended per-block count
// ((n+4)+127)/128 (one entry per 128-element block, cf. blockdim==128
// in extract_segments below). Over-allocates rather than overflows, but
// should read ((n+4)+127)/128.
95 d_segment_blocks.resize( ((n+4)+127/128) );
// Fragment of a clear()-style method: empties the temporary device vectors.
// (Enclosing signature is outside this chunk. Note thrust::device_vector::
// clear() resets the size but does not release the underlying allocation.)
104 d_sort_indices.clear();
105 d_partial_flags.clear();
106 d_temp_flags.clear();
107 d_segment_flags.clear();
108 d_segment_keys.clear();
109 d_segment_heads.clear();
110 d_active_slots.clear();
111 d_segment_blocks.clear();
// Temporary device-side working storage for the prefix-doubling suffix sort
// (members of a class whose header is outside this chunk).
122 thrust::device_vector<uint32> d_inv_keys;      // inverse keys: rank of each suffix
123 thrust::device_vector<uint32> d_sort_keys;     // radix/segment sort keys
124 thrust::device_vector<uint32> d_sort_indices;  // suffix indices being sorted
125 thrust::device_vector<uint32> d_active_slots;  // slots of still-active suffixes
126 thrust::device_vector<uint8> d_segment_flags;  // segment head flags (1 = new segment)
127 thrust::device_vector<uint8> d_partial_flags;  // flags of partially-sorted suffixes
128 thrust::device_vector<uint8> d_temp_flags;     // scratch flags for compaction
129 thrust::device_vector<uint32> d_segment_keys;  // scratch keys / scan output
130 thrust::device_vector<uint32> d_segment_heads; // scratch indices / segment heads
131 thrust::device_vector<uint32> d_segment_blocks;// per-block segment counts for scans
// Compile-time switch: when defined, the prefix-doubling kernel processes
// 4 suffixes per thread via uint4/uchar4 vector loads and stores.
137 #define VECTORIZED_PREFIX_DOUBLING
138 #if defined(VECTORIZED_PREFIX_DOUBLING)
139 #define PREFIX_DOUBLING_VECTOR 4
// scalar fallback (1 suffix per thread); the #else line itself is elided here
141 #define PREFIX_DOUBLING_VECTOR 1
// Fragment of the prefix-doubling gather kernel: for each active suffix SA[i],
// fetch the rank of the suffix j positions ahead, out_keys[i] = K[SA[i]+j],
// using 0 as a sentinel when SA[i]+j runs past the end of the string.
161 #if !defined(VECTORIZED_PREFIX_DOUBLING) // reference scalar implementation
162 const uint32 idx = threadIdx.x + blockIdx.x * blockDim.x;
166 const uint32 suf_nj = suffixes[idx] + j;
167 const uint32 K_nj = suf_nj < n_suffixes ? inv_keys_ldg[ suf_nj ] : 0u;
168 out_keys[idx] = K_nj;
169 #else // vectorized implementation
170 const uint32 idx = threadIdx.x + blockIdx.x * blockDim.x;
// each thread owns 4 consecutive slots; tail guard for the last partial uint4
171 if (idx*4u >= n_slots)
// 16-byte vector load of 4 suffix indices (relies on uint4 alignment of the
// suffixes buffer -- the +4 padding in the reserve method presumably covers
// the tail; TODO confirm alignment guarantee)
174 uint4 suf_nj = ((
const uint4*)suffixes)[idx];
175 suf_nj = make_uint4( suf_nj.x + j, suf_nj.y + j, suf_nj.z + j, suf_nj.w + j );
// per-lane bounds check, 0 sentinel for suffixes shorter than j
177 const uint4 K_nj = make_uint4(
178 suf_nj.x < n_suffixes ? inv_keys_ldg[ suf_nj.x ] : 0u,
179 suf_nj.y < n_suffixes ? inv_keys_ldg[ suf_nj.y ] : 0u,
180 suf_nj.z < n_suffixes ? inv_keys_ldg[ suf_nj.z ] : 0u,
181 suf_nj.w < n_suffixes ? inv_keys_ldg[ suf_nj.w ] : 0u );
// vectorized 16-byte store of the gathered ranks
183 ((uint4*)out_keys)[idx] = K_nj;
// Host-side launch fragment for prefix_doubling_kernel (the n_blocks
// computation and the argument list are elided from this chunk).
196 const uint32 blockdim = 256;
198 prefix_doubling_kernel<<<n_blocks,blockdim>>>(
// Fragment of build_head_flags_kernel<BLOCKDIM>: each thread handles 4
// consecutive keys, marks segment heads where adjacent keys differ (OR-ing
// with any pre-existing flags), and block-reduces the per-thread head counts
// into blocks[blockIdx.x] for a later scan. The __global__ signature and the
// flag store are elided from this chunk.
209 template <u
int32 BLOCKDIM>
217 const uint32 idx = thread_id * 4;
// key preceding this thread's 4-wide window; sentinel 0xFFFFFFFF for idx==0
// so the very first element always compares unequal
220 const uint32 key_p = idx && idx < n_flags ? keys[idx-1] : 0xFFFFFFFF;
// vectorized loads of 4 keys and their current flags; zero-filled past the end
223 const uint4 key = idx < n_flags ? ((
const uint4*)keys)[
thread_id] : make_uint4( 0, 0, 0, 0 );
226 uchar4 flag = idx < n_flags ? ((
const uchar4*)flags)[
thread_id] : make_uchar4( 0, 0, 0, 0 );
// a lane becomes a head (1) when its key differs from its predecessor's
230 (key.x != key_p) ? 1u : flag.x,
231 (key.y != key.x) ? 1u : flag.y,
232 (key.z != key.y) ? 1u : flag.z,
233 (key.w != key.z) ? 1u : flag.w );
// mask out lanes beyond n_flags so they don't contribute to the count
237 idx + 0u < n_flags ? flag.x : 0u,
238 idx + 1u < n_flags ? flag.y : 0u,
239 idx + 2u < n_flags ? flag.z : 0u,
240 idx + 3u < n_flags ? flag.w : 0u );
// number of heads produced by this thread (0..4)
247 const uint32 n = flag.x + flag.y + flag.z + flag.w;
// block-wide sum of head counts; BlockReduce is a collective -- every thread
// in the block must reach this call
248 typedef cub::BlockReduce<uint32,BLOCKDIM> BlockReduce;
250 __shared__
typename BlockReduce::TempStorage reduce_smem_storage;
252 const uint32 block_aggregate = BlockReduce( reduce_smem_storage ).Sum( n );
// only thread 0 holds the valid aggregate after BlockReduce::Sum
254 if (threadIdx.x == 0)
255 blocks[blockIdx.x] = block_aggregate;
// Fragment of extract_segments_kernel<BLOCKDIM>: given the head flags and the
// exclusive per-block offsets in blocks[], computes each head's global output
// position via a block-wide exclusive scan and scatters slot+1 into the
// compacted segments[] array. The __global__ signature and some declarations
// (thread_id, partial_scan, partial_aggregate) are elided from this chunk.
260 template <u
int32 BLOCKDIM>
270 const uint32 idx = thread_id * 4;
// vectorized load of this thread's 4 flags; zero-filled past the end
273 const uchar4 in_flag = idx < n_flags ? ((
const uchar4*)flags)[
thread_id] : make_uchar4( 0, 0, 0, 0 );
// mask out lanes beyond n_flags
275 const uchar4 flag = make_uchar4(
276 idx + 0u < n_flags ? in_flag.x : 0u,
277 idx + 1u < n_flags ? in_flag.y : 0u,
278 idx + 2u < n_flags ? in_flag.z : 0u,
279 idx + 3u < n_flags ? in_flag.w : 0u );
// number of heads this thread contributes (0..4)
282 const uint32 n = flag.x + flag.y + flag.z + flag.w;
// collective exclusive prefix sum over per-thread head counts
287 typedef cub::BlockScan<uint32,BLOCKDIM> BlockScan;
289 __shared__
typename BlockScan::TempStorage scan_smem_storage;
291 BlockScan( scan_smem_storage ).ExclusiveSum( n, partial_scan, partial_aggregate );
// add this block's global base offset (exclusive scan of per-block counts,
// produced by the host wrapper below)
294 partial_scan += blocks[ blockIdx.x ];
// inclusive per-lane output positions (1-based within this thread's window)
297 const uint4 key = make_uint4(
298 partial_scan +
uint32(flag.x),
299 partial_scan +
uint32(flag.x + flag.y),
300 partial_scan +
uint32(flag.x + flag.y + flag.z),
301 partial_scan +
uint32(flag.x + flag.y + flag.z + flag.w) );
310 const uint4 slot = ((
const uint4*)slots)[
thread_id];
// scatter: heads write their slot, stored +1 (0 presumably reserved as a
// sentinel by the consumer -- TODO confirm); key.* is inclusive, hence the
// apparent off-by-one-free indexing here vs. the -1 in compact_kernel
312 if (flag.x) segments[key.x] = slot.x + 1u;
313 if (flag.y) segments[key.y] = slot.y + 1u;
314 if (flag.z) segments[key.z] = slot.z + 1u;
315 if (flag.w) segments[key.w] = slot.w + 1u;
// Fragment of the host-side extract_segments wrapper: launches the head-flag
// kernel, scans the per-block counts, launches the scatter kernel, and returns
// the total number of segments. Argument lists of both launches are elided.
339 const uint32 blockdim = 128;
// 4 flags per thread, hence the (n_flags+3)/4 word count
340 const uint32 n_blocks = ((n_flags+3)/4 + blockdim-1) / blockdim;
343 build_head_flags_kernel<blockdim> <<<n_blocks,blockdim>>>(
// force a terminating head flag one past the end (device-side single write)
349 *thrust::device_ptr<uint8>( flags + n_flags ) = 1u;
// append a 0 so the inclusive scan's last entry equals the grand total
352 *thrust::device_ptr<uint32>( blocks + n_blocks ) = 0u;
// in-place scan of per-block head counts into global base offsets
354 thrust::device_ptr<uint32>( blocks ),
355 thrust::device_ptr<uint32>( blocks ) + n_blocks + 1u,
356 thrust::device_ptr<uint32>( blocks ),
358 thrust::plus<uint32>() );
361 extract_segments_kernel<blockdim> <<<n_blocks,blockdim>>>(
// total segment count (device->host read; implicitly synchronizes)
369 return *thrust::device_ptr<uint32>( blocks + n_blocks );
// Fragment of compact_kernel: stream-compacts (flags, slots, indices) triples
// whose stencil byte is non-zero into the 1-based positions given by keys[]
// (hence the -1 when converting to 0-based output offsets). Each thread
// processes 4 consecutive elements via vector loads; the signature, index
// computation and the per-lane `if (s.*)` guards are elided from this chunk.
377 const uint8* stencil,
// load 4 stencil bytes; skip the whole quad when none survive
391 const uchar4 s = ((
const uchar4*)stencil)[
thread_id];
393 if (s.x + s.y + s.z + s.w == 0)
// vectorized loads of the 4-wide windows of all four input streams
396 const uint4 key = ((
const uint4*)keys)[
thread_id];
397 const uchar4 flag = ((
const uchar4*)flags)[
thread_id];
398 const uint4 slot = ((
const uint4*)slots)[
thread_id];
399 const uint4 index = ((
const uint4*)indices)[
thread_id];
// keys are 1-based output positions (inclusive scan), hence key.* - 1
403 out_flags[ key.x - 1 ] = flag.x;
404 out_slots[ key.x - 1 ] = slot.x;
405 out_indices[ key.x - 1 ] = index.x;
409 out_flags[ key.y - 1 ] = flag.y;
410 out_slots[ key.y - 1 ] = slot.y;
411 out_indices[ key.y - 1 ] = index.y;
415 out_flags[ key.z - 1 ] = flag.z;
416 out_slots[ key.z - 1 ] = slot.z;
417 out_indices[ key.z - 1 ] = index.z;
421 out_flags[ key.w - 1 ] = flag.w;
422 out_slots[ key.w - 1 ] = slot.w;
423 out_indices[ key.w - 1 ] = index.w;
// Fragment of the host-side compact wrapper: sizes the grid for 4 elements
// per thread and launches compact_kernel (argument list elided).
431 const uint8* stencil,
440 const uint32 blockdim = 128;
441 const uint32 n_words = (n + 3) / 4;
442 const uint32 n_blocks = (n_words + blockdim-1) / blockdim;
444 compact_kernel<<<n_blocks,blockdim>>>(
// Fragment of the prefix-doubling suffix-sort driver: builds initial radix
// keys from packed symbols, sorts once, then repeatedly doubles the compared
// prefix length j, re-sorting only within unsorted segments and (optionally)
// pruning fully-sorted suffixes from the working set. Large parts of the loop
// body are elided from this chunk.
463 template <
typename string_type,
typename output_iterator>
466 const string_type
string,
467 output_iterator d_suffixes)
// select dollar-sign bits based on how many symbols fit a word.
// NOTE(review): the 2nd and 3rd arms both yield 2, so the SYMBOL_SIZE*3 test
// is redundant as written -- possibly a typo for a different bit count;
// confirm against the full source before changing.
475 SYMBOL_SIZE*4 <= WORD_BITS-4 ? 4 :
476 SYMBOL_SIZE*3 <= WORD_BITS-2 ? 2 :
477 SYMBOL_SIZE*2 <= WORD_BITS-2 ? 2 :
480 const uint32 SYMBOLS_PER_WORD = priv::symbols_per_word<SYMBOL_SIZE,WORD_BITS,DOLLAR_BITS>();
// one suffix per string position
482 const uint32 n_suffixes = string_len;
// initialize segment flags over the active range
489 d_segment_flags.begin(),
490 d_segment_flags.begin() + n_suffixes,
// identity permutation as the initial sort order
495 thrust::make_counting_iterator<uint32>(0u),
496 thrust::make_counting_iterator<uint32>(0u) + n_suffixes,
497 d_sort_indices.begin() );
// every slot starts active
501 thrust::make_counting_iterator<uint32>(0u),
502 thrust::make_counting_iterator<uint32>(0u) + n_suffixes,
503 d_active_slots.begin() );
517 d_sort_keys.begin() );
// initial radix sort of (key, index) pairs
535 sort_enactor.
sort( n_suffixes, sort_buffers );
// recycle the sorted buffers as scratch for the doubling loop
540 d_sort_keys.swap( d_segment_keys );
541 d_sort_indices.swap( d_segment_heads );
// scatter segment heads through the current ordering to build the inverse
// keys (rank of each suffix)
576 thrust::make_permutation_iterator( d_segment_heads.begin(), d_segment_keys.begin() ),
577 thrust::make_permutation_iterator( d_segment_heads.begin(), d_segment_keys.begin() ) + n_suffixes,
578 d_sort_indices.begin(),
579 d_inv_keys.begin() );
587 uint32 n_active_suffixes = n_suffixes;
// doubling loop: compared prefix length j doubles each iteration
589 for (
uint32 j = SYMBOLS_PER_WORD; j < string_len; j *= 2u)
// optional optimization: drop suffixes whose segment is already a singleton
591 #define CSS_KEY_PRUNING
592 #if defined(CSS_KEY_PRUNING)
// mark suffixes belonging to still-unsorted (multi-element) segments
608 thrust::adjacent_difference(
609 d_segment_flags.begin(),
610 d_segment_flags.begin() + n_active_suffixes+1u,
611 d_partial_flags.begin() + 3u,
// count surviving suffixes via an inclusive scan of the partial flags
616 d_partial_flags.begin() + 4u,
617 d_partial_flags.begin() + 4u + n_active_suffixes,
618 d_segment_keys.begin(),
619 thrust::plus<uint32>() );
// device->host read of the scan total (synchronizes)
621 const uint32 n_partials = d_segment_keys[ n_active_suffixes-1u ];
// compact only when at least a third of the active set would be pruned
627 if (3u*n_partials <= n_active_suffixes*2u)
637 d_sort_indices.begin(),
638 d_sort_indices.begin() + n_active_suffixes,
639 d_active_slots.begin(),
// adopt the compacted buffers as the new working set
656 d_segment_flags.swap( d_temp_flags );
657 d_active_slots.swap( d_sort_keys );
658 d_sort_indices.swap( d_segment_heads );
661 n_active_suffixes = n_partials;
669 #endif // if defined(CSS_KEY_PRUNING)
// segmented sort of (key, index) pairs within each unsorted segment
708 mgpu::SegSortPairsFromFlags(
// all segments are singletons -> fully sorted, done
735 if (n_segments == n_active_suffixes)
739 d_sort_indices.begin(),
740 d_sort_indices.begin() + n_active_suffixes,
741 d_active_slots.begin(),
// rebuild the inverse keys for the next doubling round
755 thrust::make_permutation_iterator( d_segment_heads.begin(), d_segment_keys.begin() ),
756 thrust::make_permutation_iterator( d_segment_heads.begin(), d_segment_keys.begin() ) + n_active_suffixes,
757 d_sort_indices.begin(),
758 d_inv_keys.begin() );