NVBIO
|
This module contains a series of functions to perform suffix-sorting and for BWT construction of very large texts and text collections.
Modules | |
Set-BWTE | |
Compression Sort | |
Set-BWT Handlers | |
String Suffix Handlers | |
Namespaces | |
nvbio::cuda | |
Classes | |
struct | nvbio::BWTParams |
Functions | |
template<typename string_type , typename suffix_iterator , typename output_handler > | |
void | nvbio::cuda::blockwise_suffix_sort (const typename string_type::index_type string_len, string_type string, const typename string_type::index_type n_suffixes, suffix_iterator suffixes, output_handler &output, const DCS *dcs, BWTParams *params) |
template<typename string_type > | |
void | nvbio::cuda::blockwise_build (DCS &dcs, const typename string_type::index_type string_len, string_type string, BWTParams *params) |
SetBWTHandler * | nvbio::open_bwt_file (const char *output_name, const char *params) |
nvbio::BWTParams::BWTParams () | |
template<typename string_type > | |
string_type::index_type | nvbio::cuda::find_primary (const typename string_type::index_type string_len, const string_type string) |
template<typename string_type , typename output_iterator > | |
void | nvbio::cuda::suffix_sort (const typename stream_traits< string_type >::index_type string_len, const string_type string, output_iterator output, BWTParams *params) |
template<typename string_type , typename output_handler > | |
void | nvbio::cuda::blockwise_suffix_sort (const typename string_type::index_type string_len, string_type string, output_handler &output, BWTParams *params) |
template<typename string_type , typename output_iterator > | |
string_type::index_type | nvbio::cuda::bwt (const typename string_type::index_type string_len, string_type string, output_iterator output, BWTParams *params) |
template<typename string_set_type , typename output_handler > | |
void | nvbio::cuda::suffix_sort (const string_set_type &string_set, output_handler &output, BWTParams *params=NULL) |
template<uint32 SYMBOL_SIZE, bool BIG_ENDIAN, typename storage_type , typename output_handler > | |
void | nvbio::cuda::bwt (const ConcatenatedStringSet< PackedStream< storage_type, uint8, SYMBOL_SIZE, BIG_ENDIAN, uint64 >, uint64 * > string_set, output_handler &output, BWTParams *params=NULL) |
template<uint32 SYMBOL_SIZE, bool BIG_ENDIAN, typename storage_type , typename output_handler > | |
void | nvbio::large_bwt (const ConcatenatedStringSet< PackedStream< storage_type, uint8, SYMBOL_SIZE, BIG_ENDIAN, uint64 >, uint64 * > string_set, output_handler &output, BWTParams *params=NULL) |
Variables | |
uint64 | nvbio::BWTParams::host_memory |
uint64 | nvbio::BWTParams::device_memory |
uint32 | nvbio::BWTParams::bucketing_bits |
uint32 | nvbio::BWTParams::radix_slice |
uint32 | nvbio::BWTParams::cpu_bucketing |
void nvbio::cuda::blockwise_build | ( | DCS & | dcs, |
const typename string_type::index_type | string_len, | ||
string_type | string, | ||
BWTParams * | params | ||
) |
build the difference cover sample of a given string
Definition at line 455 of file blockwise_sufsort.h.
void nvbio::cuda::blockwise_suffix_sort | ( | const typename string_type::index_type | string_len, |
string_type | string, | ||
const typename string_type::index_type | n_suffixes, | ||
suffix_iterator | suffixes, | ||
output_handler & | output, | ||
const DCS * | dcs, | ||
BWTParams * | params | ||
) |
Sort a list of suffixes of a given string
Definition at line 59 of file blockwise_sufsort.h.
void nvbio::cuda::blockwise_suffix_sort | ( | const typename string_type::index_type | string_len, |
string_type | string, | ||
output_handler & | output, | ||
BWTParams * | params | ||
) |
Sort all the suffixes of a given string using an adaptation of the Blockwise Suffix Sorting algorithm by J.Kärkkäinen, and can hence work in a confined amount of host and device memory (as specified by BWTParams).
string_type | an iterator to the string |
output_handler | an handler for the sorted suffixes struct StringSuffixHandler
{
// process the next contiguous batch of suffixes
//
void process_batch(
const uint32 n_suffixes,
const uint32* d_suffixes);
// process a sparse set of suffixes; this method is required because sometimes,
// in order to achieve higher parallelism, the blockwise suffix sorter will
// delay the full sorting of a few <i>hard</i> suffixes in a block and resolve
// it at a later time (overwriting previously output indices)
//
void process_scattered(
const uint32 n_suffixes,
const uint32* d_suffixes,
const uint32* d_slots)
};
|
string_len | the length of the given string |
string | a device-side string |
output | the handler for the sorted suffixes |
params | construction parameters |
Definition at line 175 of file sufsort_inl.h.
string_type::index_type nvbio::cuda::bwt | ( | const typename string_type::index_type | string_len, |
string_type | string, | ||
output_iterator | output, | ||
BWTParams * | params | ||
) |
Compute the bwt of a device-side string. This function computes the bwt using an adaptation of the Blockwise Suffix Sorting by J.Kärkkäinen, and can hence work in a confined amount of host and device memory (as specified by BWTParams).
string_type | an iterator to the string |
output_iterator | an iterator for the output list of symbols |
string_len | the length of the given string |
string | a device-side string |
output | iterator to the output suffixes |
params | construction parameters |
Definition at line 239 of file sufsort_inl.h.
void nvbio::cuda::bwt | ( | const ConcatenatedStringSet< PackedStream< storage_type, uint8, SYMBOL_SIZE, BIG_ENDIAN, uint64 >, uint64 * > | string_set, |
output_handler & | output, | ||
BWTParams * | params = NULL |
||
) |
Build the bwt of a device-side string set
SYMBOL_SIZE | alphabet size, in bits per symbol |
storage_type | underlying storage iterator (e.g. uint32*) |
output_handler | an output handler, exposing the following interface: |
string_set | a device-side packed-concatenated string-set |
output | output handler |
params | construction parameters |
Definition at line 903 of file sufsort_inl.h.
string_type::index_type nvbio::cuda::find_primary | ( | const typename string_type::index_type | string_len, |
const string_type | string | ||
) |
return the position of the primary suffix of a string
Definition at line 59 of file sufsort_inl.h.
void nvbio::large_bwt | ( | const ConcatenatedStringSet< PackedStream< storage_type, uint8, SYMBOL_SIZE, BIG_ENDIAN, uint64 >, uint64 * > | string_set, |
output_handler & | output, | ||
BWTParams * | params = NULL |
||
) |
Build the bwt of a large host-side string set - the string set might not fit into GPU memory.
SYMBOL_SIZE | alphabet size, in bits per symbol |
storage_type | underlying storage iterator (e.g. uint32*) |
output_handler | an output handler, exposing the following interface: |
string_set | a host-side packed-concatenated string-set |
output | output handler |
params | construction parameters |
Definition at line 946 of file sufsort_inl.h.
open a string-set BWT file, returning a handler that can be used by the string-set BWT construction functions.
The file type is specified by the extension of the output name; the following extensions are supported:
.txt | ASCII |
.txt.gz | ASCII, gzip compressed |
.txt.bgz | ASCII, block-gzip compressed |
.bwt | 2-bit packed binary |
.bwt.gz | 2-bit packed binary, gzip compressed |
.bwt.bgz | 2-bit packed binary, block-gzip compressed |
.bwt4 | 4-bit packed binary |
.bwt4.gz | 4-bit packed binary, gzip compressed |
.bwt4.bgz | 4-bit packed binary, block-gzip compressed |
Alongside with the main BWT file, a file containing the mapping between the primary dollar tokens and their position in the BWT will be generated. This (.pri|.pri.gz|.pri.bgz) file is a plain list of (position,string-id) pairs, either in ASCII or binary form. The ASCII file has the form:
* PRI * position[1] string[1] * ... * position[n] string[n] *
The binary file has the form:
* char[4] header = "PRIB"; * struct { uint64 position; uint32 string_id; } pairs[n]; *
output_name | output name |
params | additional compression parameters (e.g. "1R", "9", etc) |
void nvbio::cuda::suffix_sort | ( | const typename stream_traits< string_type >::index_type | string_len, |
const string_type | string, | ||
output_iterator | output, | ||
BWTParams * | params | ||
) |
Sort all the suffixes of a given string. This function uses an adaptation of Larsson and Sadanake's algorithm, and requires roughly 16b of device memory per symbol.
string_type | an iterator to the string |
output_iterator | an iterator for the output list of suffixes |
string_len | the length of the given string |
string | a device-side string |
output | iterator to the output suffixes |
params | construction parameters |
Definition at line 149 of file sufsort_inl.h.
void nvbio::cuda::suffix_sort | ( | const string_set_type & | string_set, |
output_handler & | output, | ||
BWTParams * | params = NULL |
||
) |
Sort the suffixes of all the strings in the given string set
string_set_type | string-set type |
output_handler | an output handler, exposing the following interface: |
string_set | a device-side packed-concatenated string-set |
output | output handler |
params | construction parameters |
Definition at line 80 of file sufsort_inl.h.