NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
Modules | Namespaces | Classes | Functions | Variables
Sufsort Module

Detailed Description

This module contains a series of functions to perform suffix-sorting and for BWT construction of very large texts and text collections.

Modules

 Set-BWTE
 
 Compression Sort
 
 Set-BWT Handlers
 
 String Suffix Handlers
 

Namespaces

 nvbio::cuda
 

Classes

struct  nvbio::BWTParams
 

Functions

template<typename string_type , typename suffix_iterator , typename output_handler >
void nvbio::cuda::blockwise_suffix_sort (const typename string_type::index_type string_len, string_type string, const typename string_type::index_type n_suffixes, suffix_iterator suffixes, output_handler &output, const DCS *dcs, BWTParams *params)
 
template<typename string_type >
void nvbio::cuda::blockwise_build (DCS &dcs, const typename string_type::index_type string_len, string_type string, BWTParams *params)
 
SetBWTHandler * nvbio::open_bwt_file (const char *output_name, const char *params)
 
 nvbio::BWTParams::BWTParams ()
 
template<typename string_type >
string_type::index_type nvbio::cuda::find_primary (const typename string_type::index_type string_len, const string_type string)
 
template<typename string_type , typename output_iterator >
void nvbio::cuda::suffix_sort (const typename stream_traits< string_type >::index_type string_len, const string_type string, output_iterator output, BWTParams *params)
 
template<typename string_type , typename output_handler >
void nvbio::cuda::blockwise_suffix_sort (const typename string_type::index_type string_len, string_type string, output_handler &output, BWTParams *params)
 
template<typename string_type , typename output_iterator >
string_type::index_type nvbio::cuda::bwt (const typename string_type::index_type string_len, string_type string, output_iterator output, BWTParams *params)
 
template<typename string_set_type , typename output_handler >
void nvbio::cuda::suffix_sort (const string_set_type &string_set, output_handler &output, BWTParams *params=NULL)
 
template<uint32 SYMBOL_SIZE, bool BIG_ENDIAN, typename storage_type , typename output_handler >
void nvbio::cuda::bwt (const ConcatenatedStringSet< PackedStream< storage_type, uint8, SYMBOL_SIZE, BIG_ENDIAN, uint64 >, uint64 * > string_set, output_handler &output, BWTParams *params=NULL)
 
template<uint32 SYMBOL_SIZE, bool BIG_ENDIAN, typename storage_type , typename output_handler >
void nvbio::large_bwt (const ConcatenatedStringSet< PackedStream< storage_type, uint8, SYMBOL_SIZE, BIG_ENDIAN, uint64 >, uint64 * > string_set, output_handler &output, BWTParams *params=NULL)
 

Variables

uint64 nvbio::BWTParams::host_memory
 
uint64 nvbio::BWTParams::device_memory
 
uint32 nvbio::BWTParams::bucketing_bits
 
uint32 nvbio::BWTParams::radix_slice
 
uint32 nvbio::BWTParams::cpu_bucketing
 

Function Documentation

template<typename string_type >
void nvbio::cuda::blockwise_build ( DCS &  dcs,
const typename string_type::index_type  string_len,
string_type  string,
BWTParams *  params 
)

build the difference cover sample of a given string

Definition at line 455 of file blockwise_sufsort.h.

template<typename string_type , typename suffix_iterator , typename output_handler >
void nvbio::cuda::blockwise_suffix_sort ( const typename string_type::index_type  string_len,
string_type  string,
const typename string_type::index_type  n_suffixes,
suffix_iterator  suffixes,
output_handler &  output,
const DCS *  dcs,
BWTParams *  params 
)

Sort a list of suffixes of a given string

Returns
position of the primary suffix / $ symbol

Definition at line 59 of file blockwise_sufsort.h.

template<typename string_type , typename output_handler >
void nvbio::cuda::blockwise_suffix_sort ( const typename string_type::index_type  string_len,
string_type  string,
output_handler &  output,
BWTParams *  params 
)

Sort all the suffixes of a given string using an adaptation of the Blockwise Suffix Sorting algorithm by J.Kärkkäinen, and can hence work in a confined amount of host and device memory (as specified by BWTParams).

Template Parameters
string_typean iterator to the string
output_handleran handler for the sorted suffixes
struct StringSuffixHandler
{
// process the next contiguous batch of suffixes
//
void process_batch(
const uint32 n_suffixes,
const uint32* d_suffixes);
// process a sparse set of suffixes; this method is required because sometimes,
// in order to achieve higher parallelism, the blockwise suffix sorter will
// delay the full sorting of a few <i>hard</i> suffixes in a block and resolve
// it at a later time (overwriting previously output indices)
//
void process_scattered(
const uint32 n_suffixes,
const uint32* d_suffixes,
const uint32* d_slots)
};
Parameters
string_lenthe length of the given string
stringa device-side string
outputthe handler for the sorted suffixes
paramsconstruction parameters

Definition at line 175 of file sufsort_inl.h.

template<typename string_type , typename output_iterator >
string_type::index_type nvbio::cuda::bwt ( const typename string_type::index_type  string_len,
string_type  string,
output_iterator  output,
BWTParams *  params 
)

Compute the bwt of a device-side string. This function computes the bwt using an adaptation of the Blockwise Suffix Sorting by J.Kärkkäinen, and can hence work in a confined amount of host and device memory (as specified by BWTParams).

Template Parameters
string_typean iterator to the string
output_iteratoran iterator for the output list of symbols
Parameters
string_lenthe length of the given string
stringa device-side string
outputiterator to the output suffixes
paramsconstruction parameters
Returns
position of the primary suffix / $ symbol

Definition at line 239 of file sufsort_inl.h.

template<uint32 SYMBOL_SIZE, bool BIG_ENDIAN, typename storage_type , typename output_handler >
void nvbio::cuda::bwt ( const ConcatenatedStringSet< PackedStream< storage_type, uint8, SYMBOL_SIZE, BIG_ENDIAN, uint64 >, uint64 * >  string_set,
output_handler &  output,
BWTParams *  params = NULL 
)

Build the bwt of a device-side string set

Template Parameters
SYMBOL_SIZEalphabet size, in bits per symbol
storage_typeunderlying storage iterator (e.g. uint32*)
output_handleran output handler, exposing the following interface:
struct SetBWTOutputHandler
{
// process a batch of BWT symbols
void process(
const uint32 n_suffixes, // number of sorted suffixes emitted
const uint8* h_bwt, // host-side BWT of the suffixes
const uint8* d_bwt, // device-side BWT of the suffixes
const uint2* h_suffixes, // host-side suffixes
const uint2* d_suffixes, // device-side suffixes
const uint32* d_indices); // device-side sorting index into the suffixes (possibly NULL)
};
Parameters
string_seta device-side packed-concatenated string-set
outputoutput handler
paramsconstruction parameters

Definition at line 903 of file sufsort_inl.h.

nvbio::BWTParams::BWTParams ( )
inline

Definition at line 94 of file sufsort.h.

template<typename string_type >
string_type::index_type nvbio::cuda::find_primary ( const typename string_type::index_type  string_len,
const string_type  string 
)

return the position of the primary suffix of a string

Definition at line 59 of file sufsort_inl.h.

template<uint32 SYMBOL_SIZE, bool BIG_ENDIAN, typename storage_type , typename output_handler >
void nvbio::large_bwt ( const ConcatenatedStringSet< PackedStream< storage_type, uint8, SYMBOL_SIZE, BIG_ENDIAN, uint64 >, uint64 * >  string_set,
output_handler &  output,
BWTParams *  params = NULL 
)

Build the bwt of a large host-side string set - the string set might not fit into GPU memory.

Template Parameters
SYMBOL_SIZEalphabet size, in bits per symbol
storage_typeunderlying storage iterator (e.g. uint32*)
output_handleran output handler, exposing the following interface:
struct LargeBWTOutputHandler
{
// process a batch of BWT symbols
void process(
const uint32 n_suffixes, // number of sorted suffixes emitted
const uint8* h_bwt, // host-side BWT of the suffixes
const uint8* d_bwt, // device-side BWT of the suffixes
const uint2* h_suffixes, // host-side suffixes
const uint2* d_suffixes, // device-side suffixes
const uint32* d_indices); // device-side sorting index into the suffixes (possibly NULL)
};
Parameters
string_seta host-side packed-concatenated string-set
outputoutput handler
paramsconstruction parameters

Definition at line 946 of file sufsort_inl.h.

SetBWTHandler* nvbio::open_bwt_file ( const char *  output_name,
const char *  params 
)

open a string-set BWT file, returning a handler that can be used by the string-set BWT construction functions.

The file type is specified by the extension of the output name; the following extensions are supported:

.txtASCII
.txt.gzASCII, gzip compressed
.txt.bgzASCII, block-gzip compressed
.bwt2-bit packed binary
.bwt.gz2-bit packed binary, gzip compressed
.bwt.bgz2-bit packed binary, block-gzip compressed
.bwt44-bit packed binary
.bwt4.gz4-bit packed binary, gzip compressed
.bwt4.bgz4-bit packed binary, block-gzip compressed

Alongside with the main BWT file, a file containing the mapping between the primary dollar tokens and their position in the BWT will be generated. This (.pri|.pri.gz|.pri.bgz) file is a plain list of (position,string-id) pairs, either in ASCII or binary form. The ASCII file has the form:

 * PRI
* position[1] string[1]
* ...
* position[n] string[n]
* 

The binary file has the form:

* char[4] header = "PRIB";
* struct { uint64 position; uint32 string_id; } pairs[n];
* 
Parameters
output_nameoutput name
paramsadditional compression parameters (e.g. "1R", "9", etc)
Returns
a handler that can be used by the string-set BWT construction functions
template<typename string_type , typename output_iterator >
void nvbio::cuda::suffix_sort ( const typename stream_traits< string_type >::index_type  string_len,
const string_type  string,
output_iterator  output,
BWTParams *  params 
)

Sort all the suffixes of a given string. This function uses an adaptation of Larsson and Sadanake's algorithm, and requires roughly 16b of device memory per symbol.

Template Parameters
string_typean iterator to the string
output_iteratoran iterator for the output list of suffixes
Parameters
string_lenthe length of the given string
stringa device-side string
outputiterator to the output suffixes
paramsconstruction parameters

Definition at line 149 of file sufsort_inl.h.

template<typename string_set_type , typename output_handler >
void nvbio::cuda::suffix_sort ( const string_set_type &  string_set,
output_handler &  output,
BWTParams *  params = NULL 
)

Sort the suffixes of all the strings in the given string set

Template Parameters
string_set_typestring-set type
output_handleran output handler, exposing the following interface:
struct SetSuffixOutputHandler
{
// process a batch of BWT symbols
void process(
const uint32 n_suffixes, // number of sorted suffixes emitted
const uint32* d_suffixes, // device-side array of sorted global suffix indices
const uint32* d_string_ids, // device-side array of the string ids bound to each suffix
const uint32* d_cum_lengths); // device-side array of cumulative string lengths
};
Parameters
string_seta device-side packed-concatenated string-set
outputoutput handler
paramsconstruction parameters

Definition at line 80 of file sufsort_inl.h.

Variable Documentation

uint32 nvbio::BWTParams::bucketing_bits

Definition at line 103 of file sufsort.h.

uint32 nvbio::BWTParams::cpu_bucketing

Definition at line 105 of file sufsort.h.

uint64 nvbio::BWTParams::device_memory

Definition at line 102 of file sufsort.h.

uint64 nvbio::BWTParams::host_memory

Definition at line 101 of file sufsort.h.

uint32 nvbio::BWTParams::radix_slice

Definition at line 104 of file sufsort.h.