NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
Namespaces | Classes | Enumerations | Functions
nvbio::cuda Namespace Reference

Namespaces

 wq
 

Classes

struct  Arch
 
struct  condition
 
struct  condition_set_view
 
struct  condition_set_storage
 
struct  host_device_buffer
 
struct  host_device_buffer_zero_copy
 
struct  host_device_buffer_sync
 
struct  ldg_pointer
 
struct  load_pointer
 
struct  PingPongQueuesView
 
struct  PingPongQueues
 
struct  is_sorted_iterator
 
struct  is_segment_sorted_iterator
 
struct  scan_dispatch
 
struct  scan_dispatch< T, Op, 32 >
 
struct  scan_dispatch< T, Op, 16 >
 
struct  scan_dispatch< T, Op, 8 >
 
struct  scan_dispatch< T, Op, 4 >
 
struct  scan_dispatch< T, Op, 2 >
 
struct  all_dispatch
 
struct  all_dispatch< 32 >
 
struct  all_dispatch< 2 >
 
struct  all_dispatch< 4 >
 
struct  all_dispatch< 8 >
 
struct  all_dispatch< 16 >
 
struct  all_dispatch< 64 >
 
struct  all_dispatch< 128 >
 
struct  all_dispatch< 256 >
 
struct  any_dispatch
 
struct  any_dispatch< 32 >
 
struct  any_dispatch< 2 >
 
struct  any_dispatch< 4 >
 
struct  any_dispatch< 8 >
 
struct  any_dispatch< 16 >
 
struct  any_dispatch< 64 >
 
struct  any_dispatch< 128 >
 
struct  any_dispatch< 256 >
 
struct  SortBuffers
 
struct  SortEnactor
 
struct  syncblocks
 
struct  syncblocks_storage
 
struct  Timer
 
struct  ScopedTimer
 
struct  InplaceQueueTag
 
struct  WorkQueueStatsView
 
struct  DefaultMover
 
struct  WorkQueue
 
struct  WorkQueueStats
 
struct  MultiPassQueueTag
 
struct  WorkQueue< MultiPassQueueTag, WorkUnitT, BLOCKDIM >
 
struct  OrderedQueueTag
 
struct  WorkQueue< OrderedQueueTag, WorkUnitT, BLOCKDIM >
 
struct  PersistentWarpsQueueTag
 
struct  PersistentThreadsQueueTag
 
struct  WorkQueue< PersistentWarpsQueueTag, WorkUnitT, BLOCKDIM >
 
struct  WorkQueue< PersistentThreadsQueueTag, WorkUnitT, BLOCKDIM >
 
struct  DelayList
 
struct  DiscardDelayList
 
struct  CompressionSort
 
struct  PrefixDoublingSufSort
 
struct  HostBWTConfigGPUBucketer
 
struct  HostBWTConfigCPUBucketer
 
struct  DeviceBWTConfig
 
struct  LargeBWTStatus
 
struct  LargeBWTSkeleton
 

Enumerations

enum  CacheLoadModifier {
  LOAD_DEFAULT, LOAD_CA, LOAD_CG, LOAD_CS,
  LOAD_CV, LOAD_LDG, LOAD_VOLATILE
}
 Enumeration of cache modifiers for memory load operations. More...
 
enum  WorkQueueStatsEvent { STREAM_EVENT = 0, RUN_EVENT = 1 }
 

Functions

void device_arch (uint32 &major, uint32 &minor)
 
uint32 max_grid_size ()
 
size_t multiprocessor_count ()
 
size_t smem_allocation_unit (const cudaDeviceProp &properties)
 
size_t reg_allocation_unit (const cudaDeviceProp &properties, const size_t regsPerThread)
 
size_t warp_allocation_multiple (const cudaDeviceProp &properties)
 
size_t num_sides_per_multiprocessor (const cudaDeviceProp &properties)
 
size_t max_blocks_per_multiprocessor (const cudaDeviceProp &properties)
 
size_t num_regs_per_block (const cudaDeviceProp &properties, const cudaFuncAttributes &attributes, const size_t CTA_SIZE)
 
template<typename KernelFunction >
cudaFuncAttributes function_attributes (KernelFunction kernel)
 
template<typename KernelFunction >
size_t max_active_blocks_per_multiprocessor (KernelFunction kernel, const size_t CTA_SIZE, const size_t dynamic_smem_bytes)
 
template<typename KernelFunction >
size_t max_active_blocks (KernelFunction kernel, const size_t CTA_SIZE, const size_t dynamic_smem_bytes)
 
template<typename KernelFunction >
size_t num_registers (KernelFunction kernel)
 
template<typename KernelFunction >
size_t max_blocksize_with_highest_occupancy (KernelFunction kernel, size_t dynamic_smem_bytes_per_thread)
 
bool is_tcc_enabled ()
 
void check_error (const char *message)
 
template<uint32 N>
NVBIO_FORCEINLINE
NVBIO_HOST_DEVICE void 
syncthreads ()
 
size_t max_active_blocks_per_multiprocessor (const cudaDeviceProp &properties, const cudaFuncAttributes &attributes, size_t CTA_SIZE, size_t dynamic_smem_bytes)
 
size_t max_blocksize_with_highest_occupancy (const cudaDeviceProp &properties, const cudaFuncAttributes &attributes, size_t dynamic_smem_bytes_per_thread)
 
template<typename T >
void copy (const thrust::device_vector< T > &dvec, thrust::host_vector< T > &hvec)
 
template<typename T >
void copy (host_device_buffer< T > &dvec, thrust::host_vector< T > &hvec)
 
template<typename T >
const T * device_pointer (const thrust::device_vector< T > &dvec)
 
template<typename T >
T * device_pointer (thrust::device_vector< T > &dvec)
 
template<typename T >
const T * device_pointer (const host_device_buffer< T > &dvec)
 
template<typename T >
T * device_pointer (host_device_buffer< T > &dvec)
 
template<typename T >
NVBIO_FORCEINLINE
NVBIO_HOST_DEVICE ldg_pointer
< T > 
make_ldg_pointer (const T *it)
 
template<CacheLoadModifier MOD, typename T >
NVBIO_FORCEINLINE
NVBIO_HOST_DEVICE load_pointer
< T, MOD
make_load_pointer (const T *it)
 
template<typename VectorType >
void alloc_temp_storage (VectorType &vec, const uint64 size)
 
template<typename PredicateIterator >
bool any (const uint32 n, const PredicateIterator pred)
 
template<typename PredicateIterator >
bool all (const uint32 n, const PredicateIterator pred)
 
template<typename Iterator >
bool is_sorted (const uint32 n, const Iterator values)
 
template<typename Iterator , typename Headflags >
bool is_segment_sorted (const uint32 n, const Iterator values, const Headflags flags)
 
template<typename InputIterator , typename BinaryOp >
std::iterator_traits
< InputIterator >::value_type 
reduce (const uint32 n, InputIterator d_in, BinaryOp op, thrust::device_vector< uint8 > &d_temp_storage)
 
template<typename InputIterator , typename OutputIterator , typename BinaryOp >
void inclusive_scan (const uint32 n, InputIterator d_in, OutputIterator d_out, BinaryOp op, thrust::device_vector< uint8 > &d_temp_storage)
 
template<typename InputIterator , typename OutputIterator , typename BinaryOp , typename Identity >
void exclusive_scan (const uint32 n, InputIterator d_in, OutputIterator d_out, BinaryOp op, Identity identity, thrust::device_vector< uint8 > &d_temp_storage)
 
template<typename InputIterator , typename FlagsIterator , typename OutputIterator >
uint32 copy_flagged (const uint32 n, InputIterator d_in, FlagsIterator d_flags, OutputIterator d_out, thrust::device_vector< uint8 > &d_temp_storage)
 
template<typename InputIterator , typename OutputIterator , typename Predicate >
uint32 copy_if (const uint32 n, InputIterator d_in, OutputIterator d_out, const Predicate pred, thrust::device_vector< uint8 > &d_temp_storage)
 
template<typename InputIterator , typename OutputIterator , typename CountIterator >
uint32 runlength_encode (const uint32 n, InputIterator d_in, OutputIterator d_out, CountIterator d_counts, thrust::device_vector< uint8 > &d_temp_storage)
 
template<typename KeyIterator , typename ValueIterator , typename OutputKeyIterator , typename OutputValueIterator , typename ReductionOp >
uint32 reduce_by_key (const uint32 n, KeyIterator d_keys_in, ValueIterator d_values_in, OutputKeyIterator d_keys_out, OutputValueIterator d_values_out, ReductionOp reduction_op, thrust::device_vector< uint8 > &d_temp_storage)
 
template<typename PredicateIterator >
__global__ void any_kernel (const uint32 n, const PredicateIterator pred, uint32 *r)
 
template<typename PredicateIterator >
__global__ void all_kernel (const uint32 n, const PredicateIterator pred, uint32 *r)
 
template<typename T >
__device__ __forceinline__ T bit_scan (bool p)
 
template<typename T >
__device__ __forceinline__ T scan_warp (T val, const int32 tidx, volatile T *red)
 
template<typename T >
__device__ __forceinline__ T scan_warp_total (volatile T *red)
 
template<uint32 COUNT, typename T , typename Op >
__device__ __forceinline__ T scan (T val, const Op op, const T init, volatile T *red)
 
template<uint32 COUNT, typename T >
__device__ __forceinline__ T scan (T val, volatile T *red)
 
template<uint32 COUNT, typename T >
__device__ __forceinline__ T scan_total (volatile T *red)
 
__device__ __forceinline__ uint32 alloc (uint32 n, uint32 *pool, const int32 warp_tid, volatile uint32 *warp_red, volatile uint32 *warp_broadcast)
 
template<uint32 N>
__device__ __forceinline__ uint32 alloc (bool pred, uint32 *pool, const int32 warp_tid, volatile uint32 *warp_broadcast)
 
template<uint32 COUNT>
__device__ __forceinline__ bool all (const bool p, volatile uint8 *sm=NULL)
 
template<uint32 COUNT>
__device__ __forceinline__ bool any (const bool p, volatile uint8 *sm=NULL)
 
WorkQueueStatsView view (WorkQueueStats *stats)
 
template<typename InStringSet , typename StringIterator , typename OffsetIterator >
void copy (const InStringSet &in_string_set, ConcatenatedStringSet< StringIterator, OffsetIterator > &out_string_set)
 
template<typename InStringSet , typename StringIterator , typename LengthIterator >
void copy (const InStringSet &in_string_set, StridedStringSet< StringIterator, LengthIterator > &out_string_set)
 
template<typename InStringSet , typename StreamIterator , typename SymbolType , uint32 SYMBOL_SIZE_T, bool BIG_ENDIAN_T, typename LengthIterator >
void copy (const InStringSet &in_string_set, StridedPackedStringSet< StreamIterator, SymbolType, SYMBOL_SIZE_T, BIG_ENDIAN_T, LengthIterator > &out_string_set)
 
template<typename string_type , typename suffix_iterator , typename output_handler >
void blockwise_suffix_sort (const typename string_type::index_type string_len, string_type string, const typename string_type::index_type n_suffixes, suffix_iterator suffixes, output_handler &output, const DCS *dcs, BWTParams *params)
 
template<typename string_type >
void blockwise_build (DCS &dcs, const typename string_type::index_type string_len, string_type string, BWTParams *params)
 
__global__ void prefix_doubling_kernel (const uint32 n_slots, const uint32 n_suffixes, const uint32 j, const uint32 *suffixes, const uint32 *inv_keys, uint32 *out_keys)
 
void prefix_doubling (const uint32 n_slots, const uint32 n_suffixes, const uint32 j, const uint32 *suffixes, const uint32 *inv_keys, uint32 *out_keys)
 
template<uint32 BLOCKDIM>
__global__ void build_head_flags_kernel (const uint32 n_flags, const uint32 *keys, uint8 *flags, uint32 *blocks)
 
template<uint32 BLOCKDIM>
__global__ void extract_segments_kernel (const uint32 n_flags, const uint8 *flags, const uint32 *blocks, const uint32 *slots, uint32 *keys, uint32 *segments)
 
uint32 extract_segments (const uint32 n_flags, const uint32 *in_keys, uint8 *flags, uint32 *blocks, const uint32 *slots, uint32 *keys, uint32 *segments)
 
__global__ void compact_kernel (const uint32 n, const uint8 *stencil, const uint32 *keys, const uint8 *flags, const uint32 *slots, const uint32 *indices, uint8 *out_flags, uint32 *out_slots, uint32 *out_indices)
 
void compact (const uint32 n, const uint8 *stencil, const uint32 *keys, const uint8 *flags, const uint32 *slots, const uint32 *indices, uint8 *out_flags, uint32 *out_slots, uint32 *out_indices)
 
template<typename string_type >
string_type::index_type find_primary (const typename string_type::index_type string_len, const string_type string)
 
template<typename string_type , typename output_iterator >
void suffix_sort (const typename stream_traits< string_type >::index_type string_len, const string_type string, output_iterator output, BWTParams *params)
 
template<typename string_type , typename output_handler >
void blockwise_suffix_sort (const typename string_type::index_type string_len, string_type string, output_handler &output, BWTParams *params)
 
template<typename string_type , typename output_iterator >
string_type::index_type bwt (const typename string_type::index_type string_len, string_type string, output_iterator output, BWTParams *params)
 
template<typename string_set_type , typename output_handler >
void suffix_sort (const string_set_type &string_set, output_handler &output, BWTParams *params=NULL)
 
template<uint32 SYMBOL_SIZE, bool BIG_ENDIAN, typename storage_type , typename output_handler >
void bwt (const ConcatenatedStringSet< PackedStream< storage_type, uint8, SYMBOL_SIZE, BIG_ENDIAN, uint64 >, uint64 * > string_set, output_handler &output, BWTParams *params=NULL)
 
void scan_test ()
 

Detailed Description

Strings

Enumeration Type Documentation

Enumeration of cache modifiers for memory load operations.

Enumerator
LOAD_DEFAULT 

Default (no modifier)

LOAD_CA 

Cache at all levels.

LOAD_CG 

Cache at global level.

LOAD_CS 

Cache streaming (likely to be accessed once)

LOAD_CV 

Cache as volatile (including cached system lines)

LOAD_LDG 

Cache as texture.

LOAD_VOLATILE 

Volatile (any memory space)

Definition at line 197 of file ldg.h.

Function Documentation

template<uint32 COUNT>
__device__ __forceinline__ bool nvbio::cuda::all ( const bool  p,
volatile uint8 *  sm = NULL 
)

generalized all primitive for arbitrarily (power of 2) sized thread groups

Template Parameters
COUNTthread-group size
Parameters
pper-thread predicate
smshared-memory array needed when COUNT is larger than 32
Returns
true iff the predicate is true for all threads in the thread-group

Definition at line 394 of file scan.h.

template<typename PredicateIterator >
__global__ void nvbio::cuda::all_kernel ( const uint32  n,
const PredicateIterator  pred,
uint32 *  r 
)

Definition at line 79 of file primitives_inl.h.

__device__ __forceinline__ uint32 nvbio::cuda::alloc ( uint32  n,
uint32 *  pool,
const int32  warp_tid,
volatile uint32 *  warp_red,
volatile uint32 *  warp_broadcast 
)

alloc n elements per thread from a common pool, using a synchronous warp scan

Parameters
nnumber of elements to alloc
warp_tidwarp thread index
warp_redtemporary warp scan storage (2*WARP_SIZE elements)
warp_broadcasttemporary warp broadcasting storage

Definition at line 245 of file scan.h.

template<uint32 N>
__device__ __forceinline__ uint32 nvbio::cuda::alloc ( bool  pred,
uint32 *  pool,
const int32  warp_tid,
volatile uint32 *  warp_broadcast 
)

alloc zero or exactly N elements per thread from a common pool

Parameters
pallocation predicate
warp_tidwarp thread id
warp_broadcasttemporary warp broadcasting storage

Definition at line 262 of file scan.h.

template<uint32 COUNT>
__device__ __forceinline__ bool nvbio::cuda::any ( const bool  p,
volatile uint8 *  sm = NULL 
)

generalized any primitive for arbitrarily (power of 2) sized thread groups

Template Parameters
COUNTthread-group size
Parameters
pper-thread predicate
smshared-memory array needed when COUNT is larger than 32
Returns
true iff the predicate is true for any threads in the thread-group

Definition at line 517 of file scan.h.

template<typename PredicateIterator >
__global__ void nvbio::cuda::any_kernel ( const uint32  n,
const PredicateIterator  pred,
uint32 *  r 
)

Definition at line 59 of file primitives_inl.h.

template<typename T >
__device__ __forceinline__ T nvbio::cuda::bit_scan ( bool  p)

intra-warp inclusive scan

Parameters
valper-threrad input value

Definition at line 43 of file scan.h.

template<uint32 BLOCKDIM>
__global__ void nvbio::cuda::build_head_flags_kernel ( const uint32  n_flags,
const uint32 *  keys,
uint8 *  flags,
uint32 *  blocks 
)

build a set of head flags looking at adjacent keys and compute their block sums

Definition at line 210 of file prefix_doubling_sufsort.h.

void nvbio::cuda::check_error ( const char *  message)
inline

Definition at line 263 of file arch_inl.h.

void nvbio::cuda::compact ( const uint32  n,
const uint8 *  stencil,
const uint32 *  keys,
const uint8 *  flags,
const uint32 *  slots,
const uint32 *  indices,
uint8 *  out_flags,
uint32 *  out_slots,
uint32 *  out_indices 
)
inline

scatter flags/slots/indices to the position specified by keys (-1) only if the given stencil is true

Definition at line 429 of file prefix_doubling_sufsort.h.

__global__ void nvbio::cuda::compact_kernel ( const uint32  n,
const uint8 *  stencil,
const uint32 *  keys,
const uint8 *  flags,
const uint32 *  slots,
const uint32 *  indices,
uint8 *  out_flags,
uint32 *  out_slots,
uint32 *  out_indices 
)

scatter flags/slots/indices to the position specified by keys (-1) only if the given stencil is true

Definition at line 375 of file prefix_doubling_sufsort.h.

template<typename T >
void nvbio::cuda::copy ( const thrust::device_vector< T > &  dvec,
thrust::host_vector< T > &  hvec 
)

Definition at line 155 of file host_device_buffer.h.

template<typename T >
void nvbio::cuda::copy ( host_device_buffer< T > &  dvec,
thrust::host_vector< T > &  hvec 
)

Definition at line 163 of file host_device_buffer.h.

void nvbio::cuda::device_arch ( uint32 &  major,
uint32 &  minor 
)
inline

Definition at line 32 of file arch_inl.h.

template<typename T >
const T* nvbio::cuda::device_pointer ( const thrust::device_vector< T > &  dvec)

Definition at line 176 of file host_device_buffer.h.

template<typename T >
T* nvbio::cuda::device_pointer ( thrust::device_vector< T > &  dvec)

Definition at line 182 of file host_device_buffer.h.

template<typename T >
const T* nvbio::cuda::device_pointer ( const host_device_buffer< T > &  dvec)

Definition at line 188 of file host_device_buffer.h.

template<typename T >
T* nvbio::cuda::device_pointer ( host_device_buffer< T > &  dvec)

Definition at line 194 of file host_device_buffer.h.

uint32 nvbio::cuda::extract_segments ( const uint32  n_flags,
const uint32 *  in_keys,
uint8 *  flags,
uint32 *  blocks,
const uint32 *  slots,
uint32 *  keys,
uint32 *  segments 
)
inline

given a sorted set of n input keys, flags and slots, this function:

  • updates the set of flags or'ing them with the head flags of the segments of equal keys
  • computes a new "compact" set of keys in the range [1,#distinct-keys]
  • computes a set of segment heads segments[1,...,#distinct-keys]: segments[i] := slot[m] + 1, where m is the smallest integer such that keys[m] = i.

Definition at line 330 of file prefix_doubling_sufsort.h.

template<uint32 BLOCKDIM>
__global__ void nvbio::cuda::extract_segments_kernel ( const uint32  n_flags,
const uint8 *  flags,
const uint32 *  blocks,
const uint32 *  slots,
uint32 *  keys,
uint32 *  segments 
)

extract the slots corresponding to the first key in each segment

Definition at line 261 of file prefix_doubling_sufsort.h.

template<typename KernelFunction >
cudaFuncAttributes nvbio::cuda::function_attributes ( KernelFunction  kernel)
inline

Definition at line 101 of file arch_inl.h.

bool nvbio::cuda::is_tcc_enabled ( )
inline

Definition at line 254 of file arch_inl.h.

template<typename T >
NVBIO_FORCEINLINE NVBIO_HOST_DEVICE ldg_pointer<T> nvbio::cuda::make_ldg_pointer ( const T *  it)

make a ldg_pointer

Definition at line 188 of file ldg.h.

template<CacheLoadModifier MOD, typename T >
NVBIO_FORCEINLINE NVBIO_HOST_DEVICE load_pointer<T,MOD> nvbio::cuda::make_load_pointer ( const T *  it)

make a load_pointer

Definition at line 367 of file ldg.h.

template<typename KernelFunction >
size_t nvbio::cuda::max_active_blocks ( KernelFunction  kernel,
const size_t  CTA_SIZE,
const size_t  dynamic_smem_bytes 
)

Definition at line 193 of file arch_inl.h.

template<typename KernelFunction >
size_t nvbio::cuda::max_active_blocks_per_multiprocessor ( KernelFunction  kernel,
const size_t  CTA_SIZE,
const size_t  dynamic_smem_bytes 
)

Definition at line 179 of file arch_inl.h.

size_t nvbio::cuda::max_active_blocks_per_multiprocessor ( const cudaDeviceProp &  properties,
const cudaFuncAttributes &  attributes,
size_t  CTA_SIZE,
size_t  dynamic_smem_bytes 
)
inline

Definition at line 154 of file arch_inl.h.

size_t nvbio::cuda::max_blocks_per_multiprocessor ( const cudaDeviceProp &  properties)
inline

Definition at line 116 of file arch_inl.h.

template<typename KernelFunction >
size_t nvbio::cuda::max_blocksize_with_highest_occupancy ( KernelFunction  kernel,
size_t  dynamic_smem_bytes_per_thread 
)

Definition at line 242 of file arch_inl.h.

size_t nvbio::cuda::max_blocksize_with_highest_occupancy ( const cudaDeviceProp &  properties,
const cudaFuncAttributes &  attributes,
size_t  dynamic_smem_bytes_per_thread 
)
inline

Definition at line 213 of file arch_inl.h.

uint32 nvbio::cuda::max_grid_size ( )
inline

Definition at line 44 of file arch_inl.h.

size_t nvbio::cuda::multiprocessor_count ( )
inline

Definition at line 52 of file arch_inl.h.

template<typename KernelFunction >
size_t nvbio::cuda::num_registers ( KernelFunction  kernel)

Definition at line 207 of file arch_inl.h.

size_t nvbio::cuda::num_regs_per_block ( const cudaDeviceProp &  properties,
const cudaFuncAttributes &  attributes,
const size_t  CTA_SIZE 
)
inline

Definition at line 134 of file arch_inl.h.

size_t nvbio::cuda::num_sides_per_multiprocessor ( const cudaDeviceProp &  properties)
inline

Definition at line 122 of file arch_inl.h.

void nvbio::cuda::prefix_doubling ( const uint32  n_slots,
const uint32  n_suffixes,
const uint32  j,
const uint32 *  suffixes,
const uint32 *  inv_keys,
uint32 *  out_keys 
)
inline

perform prefix-doubling on the selected suffixes

Definition at line 188 of file prefix_doubling_sufsort.h.

__global__ void nvbio::cuda::prefix_doubling_kernel ( const uint32  n_slots,
const uint32  n_suffixes,
const uint32  j,
const uint32 *  suffixes,
const uint32 *  inv_keys,
uint32 *  out_keys 
)

Definition at line 143 of file prefix_doubling_sufsort.h.

size_t nvbio::cuda::reg_allocation_unit ( const cudaDeviceProp &  properties,
const size_t  regsPerThread 
)
inline

Definition at line 70 of file arch_inl.h.

template<uint32 COUNT, typename T , typename Op >
__device__ __forceinline__ T nvbio::cuda::scan ( val,
const Op  op,
const init,
volatile T *  red 
)

intra-warp inclusive scan

Parameters
valper-threrad input value
tidxwarp thread index
redscan result storage (2*WARP_SIZE elements)

Definition at line 216 of file scan.h.

template<uint32 COUNT, typename T >
__device__ __forceinline__ T nvbio::cuda::scan ( val,
volatile T *  red 
)

intra-warp inclusive scan

Parameters
valper-threrad input value
tidxwarp thread index
redscan result storage (2*WARP_SIZE elements)

Definition at line 225 of file scan.h.

void nvbio::cuda::scan_test ( )
template<uint32 COUNT, typename T >
__device__ __forceinline__ T nvbio::cuda::scan_total ( volatile T *  red)

return the total from a scan_warp

Parameters
redscan result storage

Definition at line 232 of file scan.h.

template<typename T >
__device__ __forceinline__ T nvbio::cuda::scan_warp ( val,
const int32  tidx,
volatile T *  red 
)

intra-warp inclusive scan

Parameters
valper-threrad input value
tidxwarp thread index
redscan result storage (2*WARP_SIZE elements)

Definition at line 55 of file scan.h.

template<typename T >
__device__ __forceinline__ T nvbio::cuda::scan_warp_total ( volatile T *  red)

return the total from a scan_warp

Parameters
redscan result storage

Definition at line 73 of file scan.h.

size_t nvbio::cuda::smem_allocation_unit ( const cudaDeviceProp &  properties)
inline

Definition at line 64 of file arch_inl.h.

template<uint32 N>
NVBIO_FORCEINLINE NVBIO_HOST_DEVICE void nvbio::cuda::syncthreads ( )

a generic syncthreads() implementation to synchronize contiguous blocks of N threads at a time

Definition at line 279 of file arch_inl.h.

size_t nvbio::cuda::warp_allocation_multiple ( const cudaDeviceProp &  properties)
inline

Definition at line 95 of file arch_inl.h.