30 #include <cub/cub.cuh>
31 #include <mgpuhost.cuh>
32 #include <moderngpu.cuh>
39 #include <thrust/host_vector.h>
40 #include <thrust/device_vector.h>
41 #include <thrust/adjacent_difference.h>
42 #include <thrust/binary_search.h>
43 #include <thrust/iterator/constant_iterator.h>
45 #if defined(PLATFORM_X86)
46 #include <emmintrin.h>
104 const uint2* suffixes,
106 uint8* symbols = NULL);
114 const uint2* suffixes,
116 uint8* symbols = NULL);
120 template <
typename VectorType>
123 if (vec.size() < size)
132 log_error(stderr,
"alloc_storage() : allocation failed!\n");
140 template <
typename storage_type>
219 template <
typename string_set_type>
257 uint2
operator() (
const uint2 suffix)
const {
return make_uint2( suffix.x, suffix.y +
offset ); }
270 template <SuffixComponent COMP>
282 template <u
int32 WORD_BITS, u
int32 DOLLAR_BITS, u
int32 SYMBOL_SIZE,
typename string_type,
typename index_type>
285 const string_type
string,
286 const index_type string_len,
287 const index_type suffix_idx,
290 const uint32 SYMBOLS_PER_WORD =
uint32(WORD_BITS - DOLLAR_BITS)/SYMBOL_SIZE;
291 const uint32 SYMBOL_OFFSET =
uint32(WORD_BITS) - SYMBOL_SIZE;
294 for (
uint32 j = 0; j < SYMBOLS_PER_WORD; ++j)
296 const index_type jj = suffix_idx + w*SYMBOLS_PER_WORD + j;
297 const uint32 c = jj < string_len ?
string[jj] : 0u;
298 word |= (c << (SYMBOL_OFFSET - j*SYMBOL_SIZE));
304 const uint32 dollar_offset =
305 string_len <= suffix_idx + w*SYMBOLS_PER_WORD + SYMBOLS_PER_WORD ?
306 (string_len < suffix_idx + w*SYMBOLS_PER_WORD) ? 0u :
307 uint32(string_len - suffix_idx - w*SYMBOLS_PER_WORD) :
308 (1u << DOLLAR_BITS)-1u;
310 return word | dollar_offset;
318 template <u
int32 SYMBOL_SIZE, u
int32 WORD_BITS, u
int32 DOLLAR_BITS>
322 const uint32 SYMBOLS_PER_WORD = (WORD_BITS - DOLLAR_BITS)/SYMBOL_SIZE;
323 return SYMBOLS_PER_WORD;
326 template <u
int32 WORD_BITS, u
int32 DOLLAR_BITS, u
int32 SYMBOL_SIZE,
typename storage_type,
typename index_type,
typename sufindex_type>
329 const storage_type base_words,
330 const index_type string_len,
331 const index_type string_off,
332 const sufindex_type suffix_idx,
335 typedef typename std::iterator_traits<storage_type>::value_type word_type;
337 const uint32 STORAGE_BITS =
uint32( 8u *
sizeof(word_type) );
338 const uint32 STORAGE_SYMBOLS = STORAGE_BITS / SYMBOL_SIZE;
339 const uint32 SYMBOLS_PER_WORD =
uint32(WORD_BITS - DOLLAR_BITS)/SYMBOL_SIZE;
342 const sufindex_type suffix_off = suffix_idx + w*SYMBOLS_PER_WORD;
345 if (suffix_off >= string_len)
348 const index_type range_len = string_len - suffix_off;
349 const index_type range_off = string_off + suffix_off;
391 const uint32 m1 = range_off & (STORAGE_SYMBOLS-1);
392 const uint32 r1 = STORAGE_SYMBOLS - m1;
393 const word_type word1 = (base_words[ k1 ] << (m1*SYMBOL_SIZE));
395 word_type word = word1;
399 const word_type word2 = base_words[ k1+1u ];
400 word |= word2 >> (r1*SYMBOL_SIZE);
403 word >>= (STORAGE_BITS - WORD_BITS);
406 word &= clearmask<word_type>( WORD_BITS - n_symbols*SYMBOL_SIZE );
411 const word_type dollar_offset =
412 range_len <= SYMBOLS_PER_WORD ?
414 (1u << DOLLAR_BITS)-1u;
416 return word | dollar_offset;
422 template <u
int32 WORD_BITS, u
int32 DOLLAR_BITS, u
int32 SYMBOL_SIZE,
typename storage_type,
typename index_type,
typename sufindex_type,
typename output_iterator>
425 const storage_type base_words,
426 const index_type string_len,
427 const index_type string_off,
428 const sufindex_type suffix_idx,
431 output_iterator words)
433 typedef typename std::iterator_traits<storage_type>::value_type word_type;
435 const uint32 STORAGE_BITS =
uint32( 8u *
sizeof(word_type) );
436 const uint32 STORAGE_SYMBOLS = STORAGE_BITS / SYMBOL_SIZE;
437 const uint32 SYMBOLS_PER_WORD =
uint32(WORD_BITS - DOLLAR_BITS)/SYMBOL_SIZE;
440 sufindex_type suffix_off = suffix_idx + word_begin*SYMBOLS_PER_WORD;
442 index_type range_len = string_len - suffix_off;
443 index_type range_off = string_off + suffix_off;
445 const uint32 cache_begin =
uint32( range_off / STORAGE_SYMBOLS );
447 #if defined(PLATFORM_X86) && !defined(NVBIO_DEVICE_COMPILATION)
449 const uint32 SSE_WORDS = 16u /
sizeof( word_type );
450 const uint32 cache_end =
uint32( (range_off + (word_end - word_begin)*SYMBOLS_PER_WORD) / STORAGE_SYMBOLS );
452 __m128i sse_cache[8];
453 for (
uint32 w = cache_begin; w < cache_end; w += SSE_WORDS)
454 sse_cache[ (w - cache_begin)/SSE_WORDS ] = _mm_loadu_si128( (
const __m128i*)(base_words + w) );
456 const word_type* cached_words = (
const word_type*)sse_cache;
460 const storage_type cached_words = base_words + cache_begin;
463 for (
uint32 w = word_begin; w < word_end; ++w)
466 if (suffix_off >= string_len)
468 words[w - word_begin] = 0u;
509 const uint32 k1 =
uint32( range_off/STORAGE_SYMBOLS ) - cache_begin;
511 const uint32 m1 = range_off & (STORAGE_SYMBOLS-1);
512 const uint32 r1 = STORAGE_SYMBOLS - m1;
513 const word_type word1 = (cached_words[ k1 ] << (m1*SYMBOL_SIZE));
515 word_type word = word1;
519 const word_type word2 = cached_words[ k1+1u ];
520 word |= word2 >> (r1*SYMBOL_SIZE);
523 word >>= (STORAGE_BITS - WORD_BITS);
526 word &= clearmask<word_type>( WORD_BITS - n_symbols*SYMBOL_SIZE );
531 const word_type dollar_offset =
532 range_len <= SYMBOLS_PER_WORD ?
534 (1u << DOLLAR_BITS)-1u;
536 word |= dollar_offset;
540 words[ w - word_begin ] = word;
542 suffix_off += SYMBOLS_PER_WORD;
543 range_len -= SYMBOLS_PER_WORD;
544 range_off += SYMBOLS_PER_WORD;
569 const uint32 suffix_idx = global_suffix_idx - (string_idx ?
cum_lengths[ string_idx-1u ] : 0u);
581 template <u
int32 SYMBOL_SIZE, u
int32 WORD_BITS, u
int32 DOLLAR_BITS,
typename string_set_type,
typename word_type>
599 typedef typename string_set_type::string_type string_type;
601 const uint32 string_idx = local_suffix_idx.y;
602 const uint32 suffix_idx = local_suffix_idx.x;
604 const string_type
string =
string_set[string_idx];
605 const uint32 string_len =
string.length();
607 return result_type( extract_word_generic<WORD_BITS,DOLLAR_BITS,SYMBOL_SIZE>(
620 template <u
int32 SYMBOL_SIZE, u
int32 WORD_BITS, u
int32 DOLLAR_BITS,
typename storage_type,
typename word_type,
typename offsets_iterator>
622 SYMBOL_SIZE, WORD_BITS, DOLLAR_BITS,
628 typedef typename std::iterator_traits<offsets_iterator>::value_type
index_type;
650 const uint32 string_idx = local_suffix_idx.y;
651 const uint32 suffix_idx = local_suffix_idx.x;
657 const storage_type base_words =
string_set.base_string().stream();
659 return result_type( extract_word_packed<WORD_BITS,DOLLAR_BITS,SYMBOL_SIZE>(
673 template <u
int32 SYMBOL_SIZE, u
int32 WORD_BITS, u
int32 DOLLAR_BITS,
typename string_set_type,
typename word_type>
684 localizer( _cum_lengths, _string_ids ) {}
700 template <u
int32 SYMBOL_SIZE, u
int32 WORD_BITS, u
int32 DOLLAR_BITS,
typename string_type,
typename word_type>
719 return result_type( extract_word_generic<WORD_BITS,DOLLAR_BITS,SYMBOL_SIZE>(
733 template <u
int32 SYMBOL_SIZE, u
int32 WORD_BITS, u
int32 DOLLAR_BITS,
typename storage_type,
typename symbol_type,
typename index_type,
typename word_type>
735 SYMBOL_SIZE, WORD_BITS, DOLLAR_BITS,
736 PackedStream<storage_type,symbol_type,SYMBOL_SIZE,true,index_type>,
756 const storage_type base_words =
string.stream();
758 return result_type( extract_word_packed<WORD_BITS,DOLLAR_BITS,SYMBOL_SIZE>(
773 template <
typename string_type>
801 if (
string[suffix_idx1 + i] !=
string[suffix_idx2 + i])
814 template <u
int32 SYMBOL_SIZE,
typename string_type>
833 const uint32 WORD_BITS = 32u;
834 const uint32 DOLLAR_BITS = 4u;
835 const uint32 SYMBOLS_PER_WORD = symbols_per_word<SYMBOL_SIZE,WORD_BITS,DOLLAR_BITS>();
839 (
string_len - suffix_idx2) ) + SYMBOLS_PER_WORD-1 ) / SYMBOLS_PER_WORD;
842 for (
uint32 w = 0; w < n_words; ++w)
846 const uint32 w1 = word_functor( suffix_idx1 );
847 const uint32 w2 = word_functor( suffix_idx2 );
848 if (w1 < w2)
return true;
849 if (w1 > w2)
return false;
861 template <
typename string_type>
879 return suffix_idx ?
string[suffix_idx-1] : 255u;
889 template <
typename string_set_type>
906 typedef typename string_set_type::string_type string_type;
908 const uint32 string_idx = local_suffix_idx.y;
909 const uint32 suffix_idx = local_suffix_idx.x;
911 const string_type
string =
string_set[string_idx];
913 return suffix_idx ?
string[suffix_idx-1] : 255u;
921 typedef typename string_set_type::string_type string_type;
923 const string_type
string =
string_set[string_idx];
925 return string[
string.length()-1 ];
946 return (flag1 && flag2) ? 0u : 1u;
1024 template <u
int32 BITS, u
int32 DOLLAR_BITS>
1030 template <u
int32 SYMBOL_SIZE>
1058 return (n_strings + n_suffixes) *
sizeof(
uint32);
1064 template <
typename string_set_type>
1065 void set(
const string_set_type& string_set,
const bool empty_suffixes =
true)
1067 const uint32 n = string_set.size();
1080 thrust::plus<uint32>(),
1096 mgpu::SortedSearch<mgpu::MgpuBoundsLower>(
1097 thrust::make_counting_iterator<uint32>(0u),
1110 template <u
int32 BITS, u
int32 DOLLAR_BITS,
typename string_set_type,
typename index_iterator,
typename radix_iterator>
1112 const string_set_type& string_set,
1115 const index_iterator indices,
1116 radix_iterator radices)
1118 typedef typename std::iterator_traits<radix_iterator>::value_type word_type;
1133 template <
typename string_set_type>
1135 const string_set_type& string_set,
const bool empty_suffixes =
true)
1139 uint32( string_set.size() ),
1141 thrust::make_counting_iterator<uint32>(0u),
1143 thrust::maximum<uint32>(),
1149 template <
typename string_set_type,
typename index_iterator>
1151 const string_set_type& string_set,
1152 const index_iterator indices_begin,
1153 const index_iterator indices_end)
1157 return indices_end <= indices_begin ? 0u :
1159 indices_end - indices_begin,
1161 thrust::make_permutation_iterator(
string_ids.begin(), indices_begin ),
1163 thrust::maximum<uint32>(),
1192 template <u
int32 SYMBOL_SIZE,
bool BIG_ENDIAN,
typename storage_type,
typename offsets_iterator,
typename input_tag,
typename output_tag>
1197 template <u
int32 SYMBOL_SIZE,
bool BIG_ENDIAN,
typename storage_type,
typename offsets_iterator>
1201 typedef typename std::iterator_traits<storage_type>::value_type
word_type;
1202 typedef typename std::iterator_traits<offsets_iterator>::value_type
index_type;
1218 return (max_strings+1) *
sizeof(
uint32) +
1234 offsets_iterator> string_set,
1235 const uint32 chunk_begin,
1238 const uint32 chunk_size = chunk_end - chunk_begin;
1244 const uint64 begin_index = string_set.offsets()[ chunk_begin ];
1245 const uint64 end_index = string_set.offsets()[ chunk_end ];
1246 const uint64 begin_word = (begin_index / SYMBOLS_PER_WORD);
1247 const uint64 end_word = (end_index + SYMBOLS_PER_WORD-1) / SYMBOLS_PER_WORD;
1248 const uint32 chunk_words =
uint32( end_word - begin_word );
1250 const word_type* base_words = string_set.base_string().stream();
1256 base_words + begin_word,
1257 base_words + begin_word + chunk_words,
1258 d_chunk_string.begin() );
1261 uint32 chunk_symbols =
uint32( begin_index % SYMBOLS_PER_WORD );
1262 h_chunk_offsets[0] = chunk_symbols;
1263 for (
uint32 i = 0; i < chunk_size; ++i)
1265 chunk_symbols += string_set[ chunk_begin + i ].size();
1266 h_chunk_offsets[i+1] = chunk_symbols;
1271 h_chunk_offsets.begin(),
1272 h_chunk_offsets.begin() + chunk_size+1,
1273 d_chunk_offsets.begin() );
1279 d_packed_stream.begin(),
1290 template <u
int32 SYMBOL_SIZE,
bool BIG_ENDIAN,
typename storage_type,
typename offsets_iterator,
typename system_tag>
1291 struct ChunkLoader<SYMBOL_SIZE,BIG_ENDIAN,storage_type,offsets_iterator,system_tag,system_tag>
1293 typedef typename std::iterator_traits<offsets_iterator>::value_type
index_type;
1303 const uint32 chunk_begin,
1308 uint32( chunk_end - chunk_begin ),
1310 string_set.
offsets() + chunk_begin );
1316 template <u
int32 SYMBOL_SIZE, u
int32 BITS, u
int32 DOLLAR_BITS,
typename string_type,
typename index_iterator,
typename radix_iterator>
1319 const string_type&
string,
1321 const index_iterator indices_begin,
1322 const index_iterator indices_end,
1323 radix_iterator radices)
1325 typedef typename std::iterator_traits<radix_iterator>::value_type word_type;
1339 template <u
int32 SYMBOL_SIZE, u
int32 N_BITS, u
int32 DOLLAR_BITS>
1349 template <
typename suffix_iterator,
typename string_type>
1352 const suffix_iterator suffixes,
1353 const uint32 string_length,
1354 const string_type&
string)
1358 const uint32 n_buckets = 1u << (N_BITS);
1368 flatten_string_suffixes<SYMBOL_SIZE, N_BITS,DOLLAR_BITS>(
1373 suffixes + n_suffixes,
1388 sort_enactor.
sort( n_suffixes, sort_buffers, 0u, N_BITS );
1404 thrust::make_counting_iterator<uint32>(0u),
1405 thrust::make_counting_iterator<uint32>(0u) + n_used_buckets,
1409 thrust::adjacent_difference(
1417 template <
typename suffix_iterator,
typename string_type,
typename bucketmap_iterator,
typename output_iterator>
1420 const suffix_iterator suffixes,
1421 const uint64 string_length,
1422 const string_type&
string,
1423 const uint32 bucket_begin,
1425 const bucketmap_iterator bucketmap,
1426 output_iterator output_radices,
1427 output_iterator output_indices)
1431 const uint32 n_buckets = 1u << N_BITS;
1441 flatten_string_suffixes<SYMBOL_SIZE,N_BITS,DOLLAR_BITS>(
1446 suffixes + n_suffixes,
1460 thrust::make_zip_iterator( thrust::make_tuple( suffixes,
d_radices.begin() ) ),
1462 thrust::make_zip_iterator( thrust::make_tuple(
d_indices.begin(),
d_radices.begin() ) ) + n_suffixes,
1473 d_radices.begin() + n_suffixes + n_collected,
1488 #if defined(SORT_BY_BUCKETS)
1493 sort_enactor.
sort( n_collected, sort_buffers, 0u, N_BITS );
1545 template <
typename string_set_type, u
int32 SYMBOL_SIZE, u
int32 DOLLAR_BITS, u
int32 WORD_BITS>
1554 const uint32 SYMBOLS_PER_WORD = priv::symbols_per_word<SYMBOL_SIZE,WORD_BITS,DOLLAR_BITS>();
1556 return (max_string_len + SYMBOLS_PER_WORD-1) / SYMBOLS_PER_WORD;
1563 return n_suffixes *
sizeof(
uint8) +
1564 n_suffixes *
sizeof(
uint32);
1577 m_block.resize( n_suffixes * block_size );
1581 log_error(stderr,
"HostStringSetRadices::reserve() : allocation failed!\n");
1588 void init(
const uint32 n_suffixes,
const uint2* _h_suffixes,
const uint2* _d_suffixes)
1591 d_suffixes = thrust::device_ptr<const uint2>( _d_suffixes );
1599 const uint32 word_block_begin,
1600 const uint32 word_block_end)
1604 if (d_indices == NULL)
1615 word_block_begin == 0 ? &
h_symbols[0] : NULL );
1621 thrust::device_ptr<const uint32>( d_indices ),
1622 thrust::device_ptr<const uint32>( d_indices ) + n_indices,
1645 log_error(stderr,
"HostStringSetRadices::init_slice() : exception caught!\n");
1663 const uint32 word_block_begin,
1664 const uint32 word_block_end,
1671 m_block.begin() + n_indices * (word_idx - word_block_begin),
1672 m_block.begin() + n_indices * (word_idx - word_block_begin) + n_indices,
1673 thrust::device_ptr<uint32>( d_radices ) );
1677 log_error(stderr,
"HostStringSetRadices::extract() : exception caught!\n");
1689 const int n_strings = int( end - begin );
1692 #pragma omp parallel for
1693 for (
int i = 0; i < n_strings; ++i)
1696 h_bwt[i] =
bwt( i + begin );
1710 if (d_indices != NULL)
1714 #pragma omp parallel for
1715 for (
int i = 0; i < n_suffixes; ++i)
1730 thrust::device_ptr<const uint32>( d_indices ),
1731 thrust::device_ptr<const uint32>( d_indices ) + n_suffixes,
1737 h_indices + n_suffixes,
1751 thrust::device_ptr<const uint32>( d_indices ),
1752 thrust::device_ptr<const uint32>( d_indices ) + n_suffixes,
1754 thrust::device_ptr<uint8>( d_bwt ) );
1758 thrust::device_ptr<uint8>( d_bwt ),
1759 thrust::device_ptr<uint8>( d_bwt ) + n_suffixes,
1766 #pragma omp parallel for
1767 for (
int i = 0; i < n_suffixes; ++i)
1777 thrust::device_ptr<uint8>( d_bwt ) );
1782 log_error(stderr,
"HostStringSetRadices::bwt() : exception caught!\n");
1801 m_block.size() *
sizeof(uint2) +
1819 template <
typename string_set_type, u
int32 SYMBOL_SIZE, u
int32 DOLLAR_BITS, u
int32 WORD_BITS>
1829 const uint32 SYMBOLS_PER_WORD = priv::symbols_per_word<SYMBOL_SIZE,WORD_BITS,DOLLAR_BITS>();
1831 return (max_string_len + SYMBOLS_PER_WORD-1) / SYMBOLS_PER_WORD;
1854 void init(
const uint32 n_suffixes,
const uint2* _h_suffixes,
const uint2* _d_suffixes)
1856 d_suffixes = thrust::device_ptr<const uint2>( _d_suffixes );
1864 const uint32 word_block_begin,
1865 const uint32 word_block_end) {}
1880 const uint32 word_block_begin,
1881 const uint32 word_block_end,
1887 if (d_indices == NULL)
1892 thrust::device_ptr<uint32>( d_radices ) );
1899 thrust::device_ptr<uint32>( d_radices ) );
1910 const int n_strings = end -
begin;
1916 thrust::make_counting_iterator<uint32>(begin),
1917 thrust::make_counting_iterator<uint32>(end),
1936 if (d_indices != NULL)
1949 thrust::device_ptr<const uint32>( d_indices ),
1950 thrust::device_ptr<const uint32>( d_indices ) + n_suffixes,
1952 thrust::device_ptr<uint8>( d_bwt ) );
1960 thrust::device_ptr<uint8>( d_bwt ),
1966 thrust::device_ptr<uint8>( d_bwt ),
1967 thrust::device_ptr<uint8>( d_bwt ) + n_suffixes,
2006 const uint2* h_suffixes,
2007 const uint2* d_suffixes,
2008 const uint32* d_indices);
2028 template <
typename input_iterator,
typename output_iterator,
typename index_type>
2031 const input_iterator input,
2032 output_iterator output,
2033 const index_type offset)
2043 template <
typename input_iterator,
typename storage_type, u
int32 SYMBOL_SIZE,
bool BIG_ENDIAN,
typename index_type>
2046 const input_iterator input,
2048 const index_type offset)
2057 const uint32 SYMBOLS_PER_WORD = (8u*
sizeof(word_type))/SYMBOL_SIZE;
2059 const uint32 word_offset =
uint32( (offset + output.
index()) & (SYMBOLS_PER_WORD-1) );
2060 const uint32 elem_begin = thread_id ? (thread_id+0) * SYMBOLS_PER_WORD - word_offset : 0u;
2061 const uint32 elem_end =
nvbio::min( (thread_id+1) * SYMBOLS_PER_WORD - word_offset, n );
2065 for (
uint32 i = elem_begin; i < elem_end; ++i)
2066 output[offset+i] = input[i];
2072 template <
typename input_iterator,
typename output_iterator,
typename index_type>
2079 const input_iterator input,
2080 const output_iterator output,
2081 const index_type offset)
2084 for (
uint32 batch_begin = 0; batch_begin < n; batch_begin += batch_size)
2088 const uint32 blockdim = 128;
2090 simple_device_copy_kernel<<<n_blocks,blockdim>>>( n, input, output, offset );
2097 template <
typename input_iterator,
typename storage_type, u
int32 SYMBOL_SIZE,
bool BIG_ENDIAN,
typename index_type>
2109 const input_iterator input,
2111 const index_type offset)
2114 const uint32 SYMBOLS_PER_WORD = (8u*
sizeof(word_type))/SYMBOL_SIZE;
2117 for (
uint32 batch_begin = 0; batch_begin < n; batch_begin += batch_size)
2121 const uint32 blockdim = 128;
2125 packed_device_copy_kernel<<<n_blocks,blockdim>>>( batch_end - batch_begin, input, output, offset + batch_begin );
2132 template <
typename input_iterator,
typename output_iterator,
typename index_type>
2135 const input_iterator input,
2136 const output_iterator output,
2137 const index_type offset)
2146 template <
typename input_iterator,
typename slot_iterator,
typename range_iterator,
typename storage_type, u
int32 SYMBOL_SIZE,
bool BIG_ENDIAN,
typename index_type>
2150 const range_iterator ranges,
2151 const input_iterator input,
2152 const slot_iterator slots,
2165 const uint32 elem_begin = idx ? ranges[ idx-1 ] : 0u;
2166 const uint32 elem_end = ranges[ idx ];
2168 for (
uint32 i = elem_begin; i < elem_end; ++i)
2170 const uint32 slot = slots[i];
2171 output[ slot ] = input[i];
2178 template <
typename input_iterator,
typename slot_iterator,
typename output_iterator>
2183 const input_iterator input,
2184 const slot_iterator slots,
2185 output_iterator output)
2197 template <
typename input_iterator,
typename slot_iterator,
typename storage_type, u
int32 SYMBOL_SIZE,
bool BIG_ENDIAN,
typename index_type>
2207 const input_iterator input,
2208 const slot_iterator slots,
2214 thrust::device_vector<uint32> d_ranges( n );
2215 thrust::device_vector<uint32> d_keys( n );
2218 const uint32 SYMBOLS_PER_WORD = (8u*
sizeof(word_type))/SYMBOL_SIZE;
2223 thrust::make_counting_iterator<uint32>(1u),
2226 thrust::equal_to<uint32>(),
2227 thrust::maximum<uint32>() ).first - d_keys.begin() );
2230 for (
uint32 batch_begin = 0; batch_begin < n_ranges; batch_begin += batch_size)
2235 const uint32 blockdim = 128;
2238 device_scatter_kernel<<<n_blocks,blockdim>>>(
2251 template <
typename input_iterator,
typename slot_iterator,
typename output_iterator>
2254 const input_iterator input,
2255 const slot_iterator slots,
2256 output_iterator output)