# API Quick Reference

This appendix is a condensed reference for the cuda-oxide device and host APIs.
For full documentation, run `cargo doc --no-deps --open` from the workspace
root.

## Attributes and Macros

### Kernel and Device Attributes
use cuda_device::{kernel, device, launch_bounds, cluster_launch};
#[kernel]
pub fn vecadd(a: &[f32], b: &[f32], mut c: DisjointSlice<f32>) { /* ... */ }
#[kernel]
#[launch_bounds(256, 2)]
pub fn tuned_kernel(data: &mut [f32]) { /* ... */ }
#[kernel]
#[cluster_launch(4, 1, 1)]
pub fn cluster_kernel(data: &mut [f32]) { /* ... */ }
#[device]
fn helper(x: f32) -> f32 { x * x }
| Attribute | Purpose |
|---|---|
| `#[kernel]` | Mark a function as a GPU kernel entry point (…) |
| `#[device]` | Mark a helper function or … |
| `#[launch_bounds(…)]` | Occupancy hints for register allocation |
| `#[cluster_launch(…)]` | Set compile-time cluster dimensions (Hopper+) |
| — | Mark as convergent (barrier semantics) |
| — | Mark as side-effect free |
| — | Mark as read-only |

*Cells marked — / … were lost in extraction; attribute names are filled in only where the examples above make them unambiguous. Restore the rest from `cargo doc`.*

### Output Macros
use cuda_device::{gpu_printf, gpu_assert};
gpu_printf!("thread %d: val = %f\n", idx as i32, val as f64);
gpu_assert!(val >= 0.0);
| Macro | Purpose |
|---|---|
| `gpu_printf!` | Device-side formatted output (lowers to …) |
| `gpu_assert!` | Runtime assertion; calls … on failure |

## Thread Identification
use cuda_device::thread;
let idx = thread::index_1d(); // ThreadIndex (1D grids)
let idx2d = thread::index_2d(row_stride); // Option<ThreadIndex> (2D grids)
let tid_x = thread::threadIdx_x(); // u32
let bid_x = thread::blockIdx_x(); // u32
let bdim_x = thread::blockDim_x(); // u32
| Function | Returns | Description |
|---|---|---|
| `thread::index_1d` | `ThreadIndex` | Unique linear index (1D grids) |
| `thread::index_2d` | `Option<ThreadIndex>` | Unique linear index (2D grids) |
| — | — | 2D row index |
| — | — | 2D column index |
| `thread::threadIdx_x` | `u32` | Thread index within block |
| `thread::blockIdx_x` | `u32` | Block index within grid |
| `thread::blockDim_x` | `u32` | Block dimensions |

*Cells marked — were lost in extraction.*

`thread::index_2d` returns `None` when the computed column exceeds
`row_stride` — use it to skip the right-edge tail in non-aligned 2D kernels.

## Safe Parallel Writes — `DisjointSlice`
use cuda_device::{DisjointSlice, thread};
#[kernel]
pub fn vecadd(a: &[f32], b: &[f32], mut c: DisjointSlice<f32>) {
let idx = thread::index_1d();
if let Some(elem) = c.get_mut(idx) {
*elem = a[idx.get()] + b[idx.get()];
}
}
| Method | Signature | Description |
|---|---|---|
| `get_mut` | — | Bounds-checked mutable access |
| — | — | Unsafe, unchecked access |
| — | — | Number of elements |

*Cells marked — were lost in extraction.*

## Synchronization

### Block-Level
thread::sync_threads(); // __syncthreads() equivalent
### Managed Barriers (Hopper+)
use cuda_device::{ManagedBarrier, TmaBarrierHandle, Uninit, Ready};
// Typestate lifecycle: Uninit → Ready → Invalidated
let bar: TmaBarrierHandle<Uninit> = TmaBarrierHandle::from_static(ptr);
let bar: TmaBarrierHandle<Ready> = unsafe { bar.init(thread_count) };
let token = bar.arrive();
bar.wait(token);
unsafe { bar.inval() };
| Operation | Description |
|---|---|
| `init` | Initialize barrier with expected arrival count |
| `arrive` | Signal arrival, returns an arrival token |
| — | Arrive and set expected TX byte count (for TMA) |
| `wait` | Block until all arrivals + TX complete |
| `inval` | Invalidate barrier (cleanup) |

*The cell marked — was lost in extraction.*

## Warp Primitives
use cuda_device::warp;
let lane = warp::lane_id(); // 0–31
let wid = warp::warp_id();
// Shuffle
let partner = warp::shuffle_xor_f32(val, mask);
let from_above = warp::shuffle_down_f32(val, delta);
let from_below = warp::shuffle_up_f32(val, delta);
let from_lane = warp::shuffle_f32(val, src_lane);
// i32 variants
let partner_i = warp::shuffle_xor_i32(val, mask);
// Vote
let all_true = warp::all(predicate);
let any_true = warp::any(predicate);
let mask = warp::ballot(predicate);
let count = warp::popc(mask);
### Shuffle Operations

| Function | Description |
|---|---|
| `shuffle_xor_f32` | Exchange with lane `lane_id ^ mask` |
| `shuffle_down_f32` | Read from lane `lane_id + delta` |
| `shuffle_up_f32` | Read from lane `lane_id - delta` |
| `shuffle_f32` | Read from specific lane |
### Vote Operations

| Function | Returns | Description |
|---|---|---|
| `warp::all` | `bool` | True if predicate holds for all lanes |
| `warp::any` | `bool` | True if predicate holds for any lane |
| `warp::ballot` | `u32` | Bitmask of lanes where predicate is true |
| `warp::popc` | `u32` | Population count of set bits |

## Atomics

### Scoped GPU Atomics
use cuda_device::atomic::{DeviceAtomicU32, AtomicOrdering};
static COUNTER: DeviceAtomicU32 = DeviceAtomicU32::new(0);
// In kernel:
COUNTER.fetch_add(1, AtomicOrdering::Relaxed);
let old = COUNTER.load(AtomicOrdering::Acquire);
| Scope | Types |
|---|---|
| — | `DeviceAtomicU32`, … |
| — | — |

*Scope names and type lists were lost in extraction; the module provides device/block/system scopes (see the module table below).*

`core::sync::atomic` types (`AtomicU32`, `AtomicBool`, etc.) also compile to
GPU code, defaulting to system scope.

## TMA — Tensor Memory Accelerator (Hopper+)
use cuda_device::tma::TmaDescriptor;
use cuda_device::tma::{cp_async_bulk_tensor_2d_g2s, cp_async_bulk_commit_group};
// Host: build descriptor (128 bytes, opaque)
// Device: issue async bulk copy
cp_async_bulk_tensor_2d_g2s(smem_ptr, &desc, coord_x, coord_y, barrier_ptr);
cp_async_bulk_commit_group();
| Function | Description |
|---|---|
| `cp_async_bulk_tensor_2d_g2s` | Global → shared async bulk copy |
| — | Shared → global async bulk copy |
| — | Multicast to all CTAs in cluster |
| `cp_async_bulk_commit_group` | Commit outstanding copies |
| — | Wait until ≤ n groups remain |

*Cells marked — were lost in extraction; function names are filled in only where the example above shows them.*

## Cluster Programming (Hopper+)
use cuda_device::cluster;
let rank = cluster::block_rank(); // This block's rank in the cluster
let size = cluster::cluster_size(); // Number of blocks in cluster
cluster::cluster_sync(); // Barrier across all cluster blocks
// Distributed Shared Memory
let remote_ptr = cluster::map_shared_rank(local_ptr, target_rank);
let val = cluster::dsmem_read_u32(remote_ptr);
## Tensor Cores — WGMMA (Hopper, SM 90)
use cuda_device::wgmma;
wgmma::wgmma_fence();
wgmma::wgmma_commit_group();
wgmma::wgmma_wait_group::<0>();
Warpgroup MMA: 4 warps (128 threads) issue matrix multiply-accumulate from shared memory. Operands described by SMEM descriptors; accumulator in registers.
## Tensor Cores — tcgen05 (Blackwell, SM 100+)
use cuda_device::tcgen05::{TmemGuard, TmemUninit, TmemReady};
use cuda_device::SharedArray;
static mut TMEM_SLOT: SharedArray<u32, 1, 4> = SharedArray::UNINIT;
let guard = TmemGuard::<TmemUninit, 512>::from_static(&raw mut TMEM_SLOT as *mut u32);
let guard = unsafe { guard.alloc() }; // TmemUninit → TmemReady
// ... issue MMA, read results via guard.address() ...
let _guard = unsafe { guard.dealloc() }; // TmemReady → TmemDeallocated
Single-thread MMA issue into dedicated Tensor Memory (TMEM). TmemGuard
manages TMEM lifetime with typestate: TmemUninit → TmemReady → TmemDeallocated.
N_COLS must be a power of 2 in the range [32, 512].
## Host-Side: Kernel Launch

### Synchronous
use cuda_core::{CudaContext, DeviceBuffer, LaunchConfig};
use cuda_host::{cuda_launch, load_kernel_module};
let ctx = CudaContext::new(0).unwrap();
let stream = ctx.default_stream();
let module = load_kernel_module(&ctx, "vecadd").unwrap();
let a = DeviceBuffer::from_host(&stream, &a_host).unwrap();
let b = DeviceBuffer::from_host(&stream, &b_host).unwrap();
let mut output = DeviceBuffer::<f32>::zeroed(&stream, n).unwrap();
cuda_launch! {
kernel: vecadd,
stream: stream,
module: module,
config: LaunchConfig::for_num_elems(n),
args: [slice(a), slice(b), slice_mut(output)]
}.unwrap();
### Async
use cuda_async::device_operation::DeviceOperation;
use cuda_host::cuda_launch_async;
let op = cuda_launch_async! {
kernel: vecadd,
module: module,
config: LaunchConfig::for_num_elems(n),
args: [slice(a), slice(b), slice_mut(output)]
};
op.sync()?; // blocking
// or: op.await?; // async with tokio
### `LaunchConfig`

| Method | Description |
|---|---|
| `for_num_elems` | Auto-configure grid/block for … |
| — | Direct struct construction |

*Cells marked — / … were lost in extraction.*

## Debug Facilities
use cuda_device::debug;
let t = debug::clock64(); // Cycle counter
debug::trap(); // Abort kernel
debug::breakpoint(); // cuda-gdb breakpoint
cuda_device::barrier::nanosleep(1000); // Sleep ~1μs
debug::prof_trigger::<7>(); // Nsight profiler trigger
## Quick Reference Tables

### cuda-device Modules
| Module | Description | Min SM |
|---|---|---|
| `thread` | Thread/block IDs, … | All |
| — | — | All |
| — | — | All |
| `warp` | Shuffle, vote, match, lane/warp ID | All |
| `atomic` | Scoped atomics (device/block/system) | sm_70+ |
| — | — | All |
| — | — | All |
| — | Grid-scoped … | sm_70+ |
| — | Typed handles, warp/block reductions and scans | All |
| — | — | sm_90+ |
| `cluster` | Thread block clusters, DSMEM | sm_90+ |
| — | — | sm_90+ |
| `wgmma` | Warpgroup MMA (fence/commit/wait) | sm_90 |
| `tcgen05` | 5th-gen tensor cores, TMEM, … | sm_100+ |
| — | — | All |
| — | Cluster Launch Control | sm_100+ |

*Cells marked — / … were lost in extraction; module names are filled in only where the examples in this appendix make them unambiguous.*

## Crate Map
| Crate | Role |
|---|---|
| `cuda-device` | Device intrinsics and types (…) |
| — | Proc macros (…) |
| — | — |
| `cuda-core` | Safe RAII wrappers (…) |
| — | — |
| — | Raw … |
| — | Cargo subcommand (…) |

*Cells marked — / … were lost in extraction; crate names are filled in only where the host-side examples above show them (`cuda_device`, `cuda_core`).*