30 #include <cub/cub.cuh>
31 #include <mgpuhost.cuh>
32 #include <moderngpu.cuh>
38 #include <thrust/host_vector.h>
39 #include <thrust/device_vector.h>
40 #include <thrust/adjacent_difference.h>
53 template <u
int32 SYMBOL_SIZE, u
int32 N_BITS, u
int32 DOLLAR_BITS,
typename bucket_type = u
int32>
90 template <
typename string_set_type>
99 const uint32 n_buckets = 1u << N_BITS;
113 thrust::make_counting_iterator<uint32>(0u),
128 sort_enactor.
sort( n_suffixes, sort_buffers, 0u, N_BITS );
142 mgpu::SortedSearch<mgpu::MgpuBoundsUpper>(
143 thrust::make_counting_iterator<uint32>(0u),
151 thrust::adjacent_difference(
164 template <
typename string_set_type,
typename bucketmap_iterator>
166 const string_set_type& string_set,
167 const uint32 bucket_begin,
169 const uint32 string_offset,
170 const bucketmap_iterator bucketmap)
183 const uint32 n_buckets = 1u << N_BITS;
197 thrust::make_counting_iterator<uint32>(0u),
211 thrust::make_zip_iterator( thrust::make_tuple( thrust::make_counting_iterator<uint32>(0u),
d_radices.begin() ) ),
213 thrust::make_zip_iterator( thrust::make_tuple(
d_indices.begin(),
d_radices.begin() ) ) + n_suffixes,
270 #if defined(SORT_BY_BUCKETS)
275 sort_enactor.
sort( n_collected, sort_buffers, 0u, N_BITS );
316 suffixes.allocated_device_memory() +
369 typename storage_type,
372 typename bucket_type,
398 template <
typename string_set_type>
400 const string_set_type& string_set,
401 thrust::host_vector<uint32>& h_buckets)
406 const uint32 chunk_size = 128*1024;
407 const uint32 n_strings = string_set.size();
411 const uint32 n_buckets = 1u << N_BITS;
423 for (
uint32 chunk_begin = 0; chunk_begin < n_strings; chunk_begin += chunk_size)
431 const chunk_set_type chunk_set = chunk_loader.load( string_set, chunk_begin, chunk_end );
447 thrust::plus<uint32>() );
479 template <
typename string_set_type>
481 const string_set_type& string_set,
482 const uint32 bucket_begin,
485 const thrust::host_vector<uint32>& h_subbuckets,
486 thrust::host_vector<uint2>& h_suffixes)
488 const uint32 chunk_size = 128*1024;
489 const uint32 n_strings = string_set.size();
501 for (
uint32 chunk_begin = 0; chunk_begin < n_strings; chunk_begin += chunk_size)
509 const chunk_set_type chunk_set = chunk_loader.load( string_set, chunk_begin, chunk_end );
532 if (n_collected +
m_bucketer.n_collected > h_suffixes.size())
534 log_error(stderr,
"buffer size exceeded! (%llu/%llu)\n", n_collected +
m_bucketer.n_collected,
uint64( h_suffixes.size() ));
551 "[%u] = (%u,%u) placed at %llu - %llu (%u)\n", i, loc.x, loc.y, slot,
m_global_offset, bucket );
628 template <u
int32 SYMBOL_SIZE, u
int32 N_BITS, u
int32 DOLLAR_BITS,
typename bucket_type = u
int32>
644 const uint32 n_buckets = 1u << N_BITS;
650 for (
uint32 i = 0; i < n_buckets; ++i)
660 template <
typename string_set_type>
663 typedef typename string_set_type::string_type string_type;
668 const uint32 n_strings = string_set.size();
673 for (
uint32 i = 0; i < n_strings; ++i)
675 const string_type
string = string_set[i];
679 for (
uint32 j = 0; j < string_len; ++j)
682 const bucket_type radix = word_functor( make_uint2( j, i ) );
689 _n_suffixes += string_len;
701 template <
typename string_set_type,
typename bucketmap_iterator>
703 const string_set_type& string_set,
704 const uint32 bucket_begin,
706 const uint32 string_offset,
707 const bucketmap_iterator bucketmap)
709 typedef typename string_set_type::string_type string_type;
714 const uint32 n_strings = string_set.size();
723 for (
uint32 i = 0; i < n_strings; ++i)
725 const string_type
string = string_set[i];
729 for (
uint32 j = 0; j < string_len; ++j)
732 const bucket_type radix = word_functor( make_uint2( j, i ) );
735 if (radix >= bucket_begin && radix < bucket_end)
793 typename storage_type,
796 typename bucket_type>
821 template <
typename string_set_type>
823 const string_set_type& string_set,
824 thrust::host_vector<uint32>& h_buckets)
830 const uint32 n_buckets = 1u << N_BITS;
831 const uint32 chunk_size = 128*1024;
832 const uint32 batch_size = n_threads * chunk_size;
833 const uint32 n_strings = string_set.size();
836 #pragma omp parallel for
837 for (
int b = 0; b < int(n_buckets); ++b)
847 for (
uint32 i = 0; i < n_threads; ++i)
851 for (
uint32 batch_begin = 0; batch_begin < n_strings; batch_begin += batch_size)
860 const uint32 chunk_begin = batch_begin + tid * chunk_size;
865 if (chunk_begin < batch_end)
867 const chunk_set_type chunk_set = chunk_loader.load( string_set, chunk_begin, chunk_end );
878 for (
uint32 i = 0; i < n_threads; ++i)
883 #pragma omp parallel for
884 for (
int b = 0; b < int(n_buckets); ++b)
908 template <
typename string_set_type>
910 const string_set_type& string_set,
911 const uint32 bucket_begin,
914 const thrust::host_vector<uint32>& h_subbuckets,
915 thrust::host_vector<uint2>& h_suffixes)
917 const uint32 batch_size = 1024*1024;
918 const uint32 n_strings = string_set.size();
930 for (
uint32 batch_begin = 0; batch_begin < n_strings; batch_begin += batch_size)
944 const uint32 chunk_begin = batch_begin + tid * chunk_size;
949 if (chunk_begin < batch_end)
951 const chunk_set_type chunk_set = chunk_loader.load( string_set, chunk_begin, chunk_end );
958 h_subbuckets.begin() );
966 for (
uint32 i = 0; i < n_threads; ++i)
968 const uint32 chunk_begin = batch_begin + i * chunk_size;
971 if (chunk_begin < batch_end)
980 if (n_collected +
m_bucketers[i].n_collected > h_suffixes.size())
982 log_error(stderr,
"buffer size exceeded! (%llu/%llu)\n", n_collected +
m_bucketers[i].n_collected,
uint64( h_suffixes.size() ));
996 "[%u:%u] = (%u,%u) placed at %llu - %llu (%u)\n", i, j, loc.x, loc.y, slot,
m_global_offset, bucket );
1045 std::vector< HostCoreSetSuffixBucketer<SYMBOL_SIZE,N_BITS,DOLLAR_BITS,bucket_type> >
m_bucketers;