Operations and Intrinsics#
cuTile Rust provides a rich set of operations for GPU computation. All operations work on tiles and leverage GPU parallelism.
Loading and Storing#
Basic Load/Store#
// Load entire output tile
let tile = load_tile_mut(tensor);
// Store result
tensor.store(tile);
Load Like (Positional Loading)#
Load from a dynamic tensor at the position matching another tile:
// Load from x at the same position as output tile z
let tile_x = load_tile_like_2d(x, z);
let tile_y = load_tile_like_2d(y, z);
This is the most common pattern for element-wise operations.
Partitioned Loading#
For explicit control over which tile to load:
let part = tensor.partition(const_shape![16, 16]);
let tile = part.load([row_idx, col_idx]);
Elementwise Operations#
Standard math operations work element-by-element on tiles:
Arithmetic#
let c = a + b; // Addition
let c = a - b; // Subtraction
let c = a * b; // Multiplication
let c = a / b; // Division
With Scalars#
let scale = 2.0f32;
let scaled = tile * scale; // Multiply by scalar
let shifted = tile + 1.0f32; // Add scalar
Compound Operations#
// SAXPY: y = a*x + y
let result = a * x + y;
// Fused multiply-add (more accurate)
let result = fma(a, x, y);
// Fused multiply-add with rounding mode
let result = fma_op(a, x, y, rounding_mode);
Mathematical Functions#
Exponential and Logarithmic#
let y = exp(x); // e^x
let y = exp2(x); // 2^x (faster on GPU)
let y = log(x); // Natural log (ln)
let y = log2(x); // Log base 2
let y = sqrt(x, "rn"); // Square root (requires rounding mode)
let y = rsqrt(x); // 1/sqrt(x) (fast reciprocal sqrt)
Trigonometric#
let y = sin(x); // Sine
let y = cos(x); // Cosine
let y = tanh(x); // Hyperbolic tangent
Other#
let y = absf(x); // Absolute value (float)
let y = absi(x); // Absolute value (integer)
let y = negf(x); // Negation (float)
let y = negi(x); // Negation (integer)
let y = ceil(x, "rn"); // Ceiling (requires rounding mode)
let y = floor(x); // Floor
Reduction Operations#
Reduce along an axis to produce a smaller tile:
Max and Sum#
// Input: Tile<f32, {[BM, BN]}>
// Reduce across columns (axis=1) → Tile<f32, {[BM]}>
let row_max = reduce_max(tile, 1i32);
let row_sum = reduce_sum(tile, 1);
// Reduce across rows (axis=0) → Tile<f32, {[BN]}>
let col_max = reduce_max(tile, 0i32);
let col_sum = reduce_sum(tile, 0);
Min#
let row_min = reduce_min(tile, 1);
let col_min = reduce_min(tile, 0);
Prod#
let row_prod = reduce_prod(tile, 1);
let col_prod = reduce_prod(tile, 0);
Matrix Operations#
Matrix Multiply-Accumulate (MMA)#
The workhorse of GPU computing:
// C = A @ B + C
let c = mma(a, b, c);
// For accumulation loop:
let mut acc = constant(0.0f32, const_shape![BM, BN]);
for i in 0..K {
let a_tile = load_a(i);
let b_tile = load_b(i);
acc = mma(a_tile, b_tile, acc);
}
Shape requirements:
- A: [M, K]
- B: [K, N]
- C: [M, N]
- Result: [M, N]
Transpose / Permute#
// Define permutation
let transpose: Array<{[1, 0]}> = Array::<{[1, 0]}> {
dims: &[1i32, 0i32],
};
// Apply transpose
let transposed = permute(tile, transpose);
// [M, N] → [N, M]
Broadcasting#
Expand a smaller tile to match a larger shape:
Scalar Broadcasting#
// Broadcast scalar to tile
let scalar = 2.0f32;
let tile = scalar.broadcast(const_shape![64, 64]);
// Creates 64×64 tile filled with 2.0
Dimension Broadcasting#
// Broadcast [BM] to [BM, BN]
let row_values: Tile<f32, {[BM]}> = ...;
let expanded = row_values
.reshape(const_shape![BM, 1])
.broadcast(const_shape![BM, BN]);
Common Pattern: Softmax Normalization#
// Get max per row: [BM, BN] → [BM]
let row_max = reduce_max(tile, 1);
// Broadcast back: [BM] → [BM, BN]
let max_broadcast = row_max
.reshape(const_shape![BM, 1])
.broadcast(tile.shape());
// Subtract max from each element
let normalized = tile - max_broadcast;
Shape Operations#
Reshape#
Change shape without changing data (total elements must match):
// Flatten 2D to 1D
let flat = tile.reshape(const_shape![BM * BN]);
// Reshape for broadcasting
let col_vector = row.reshape(const_shape![BM, 1]);
Get Shape#
let shape = tensor.shape();
let dim_0 = get_shape_dim(tensor.shape(), 0i32);
Comparison Operations#
// Element-wise comparisons return bool tiles
let mask = gt_tile(a, b); // a > b
let mask = ge_tile(a, b); // a >= b
let mask = lt_tile(a, b); // a < b
let mask = le_tile(a, b); // a <= b
let mask = eq_tile(a, b); // a == b
Select (Conditional)#
// Select elements based on mask
let result = select(mask, if_true, if_false);
Control Flow Operations#
Tile-Level Max/Min#
// Element-wise max/min of two tiles
let result = max_tile(a, b);
let result = min_tile(a, b);
Constants#
// Create constant tile
let zeros = constant(0.0f32, const_shape![64, 64]);
let ones = constant(1.0f32, const_shape![64, 64]);
let neg_inf = constant(f32::NEG_INFINITY, const_shape![BM, 1]);
Iota (Index Generation)#
// Create [0, 1, 2, 3, ...] tile
let indices: Tile<i32, {[64]}> = iota(const_shape![64]);
Utility Operations#
Print (Debugging)#
cuda_tile_print!("Value at tile ({}, {}): {}\n",
pid.0, pid.1, some_value);
Warning
GPU printing is slow and should only be used for debugging small grids.
Type Conversion#
let float_tile: Tile<f32, S> = convert_tile(int_tile);
let half_tile: Tile<f16, S> = convert_tile(float_tile);
Common Operation Patterns#
Element-wise with Broadcast#
/// Applies `x * scale + shift` to every element of `x`.
///
/// Both scalars are broadcast to the shape of `x` first, so the arithmetic is
/// performed tile-against-tile and the result keeps the shape `S`.
fn scale_and_shift<const S: [i32; 2]>(
    x: Tile<f32, S>,
    scale: f32,
    shift: f32,
) -> Tile<f32, S> {
    // Expand each scalar into a tile with the same shape as the input.
    let scale_tile = scale.broadcast(x.shape());
    let shift_tile = shift.broadcast(x.shape());

    x * scale_tile + shift_tile
}
Numerically Stable Softmax#
/// Row-wise softmax over a `[BM, BN]` tile.
///
/// The maximum of each row is subtracted before exponentiation for numerical
/// stability; the exponentials are then divided by their per-row sum.
fn softmax<const BM: i32, const BN: i32>(
    x: Tile<f32, { [BM, BN] }>,
) -> Tile<f32, { [BM, BN] }> {
    // Per-row maximum (reduction along axis 1), expanded back to [BM, BN].
    let row_max: Tile<f32, { [BM, BN] }> = reduce_max(x, 1i32)
        .reshape(const_shape![BM, 1])
        .broadcast(const_shape![BM, BN]);
    let shifted = x - row_max;

    // Exponentiate, then normalise each row by the sum of its exponentials.
    let exps = exp(shifted);
    let row_sum: Tile<f32, { [BM, BN] }> = reduce_sum(exps, 1)
        .reshape(const_shape![BM, 1])
        .broadcast(const_shape![BM, BN]);

    true_div(exps, row_sum)
}
Tiled Matrix Multiply#
/// Computes one `[BM, BN]` output tile of `x @ y` and stores it into `z`.
///
/// `x` is partitioned into `[BM, BK]` tiles and `y` into `[BK, BN]` tiles.
/// The first two components of the tile block id pick the row of `x` tiles
/// and the column of `y` tiles; the loop then walks the shared dimension in
/// `K / BK` steps, accumulating each product with `mma`.
///
/// NOTE(review): `K / BK` is integer division, so any remainder of `K` is not
/// visited — presumably `K` is expected to be a multiple of `BK`; confirm.
/// NOTE(review): the accumulator is created from `0.0f32` while `z` holds
/// elements of type `E` — confirm this is intended for non-`f32` `E`.
fn tiled_gemm<E: ElementType, const BM: i32, const BN: i32, const BK: i32, const K: i32>(
    z: &mut Tensor<E, { [BM, BN] }>,
    x: &Tensor<E, { [-1, K] }>,
    y: &Tensor<E, { [K, -1] }>,
) {
    let x_tiles = x.partition(const_shape![BM, BK]);
    let y_tiles = y.partition(const_shape![BK, BN]);
    let block: (i32, i32, i32) = get_tile_block_id();

    // Running sum of the partial products for this output tile.
    let mut acc = constant(0.0f32, const_shape![BM, BN]);
    for k_step in 0i32..(K / BK) {
        let lhs = x_tiles.load([block.0, k_step]);
        let rhs = y_tiles.load([k_step, block.1]);
        acc = mma(lhs, rhs, acc);
    }

    z.store(acc);
}
Summary#
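| Category | Key Operations |
|---|---|
| Load/Store | `load_tile_mut`, `load_tile_like_2d`, `partition` + `load`, `store` |
| Arithmetic | `+`, `-`, `*`, `/`, `fma`, `fma_op` |
| Math | `exp`, `exp2`, `log`, `log2`, `sqrt`, `rsqrt`, `sin`, `cos`, `tanh` |
| Reduction | `reduce_max`, `reduce_sum`, `reduce_min`, `reduce_prod` |
| Matrix | `mma`, `permute` |
| Shape | `reshape`, `broadcast`, `shape`, `get_shape_dim` |
| Compare | `gt_tile`, `ge_tile`, `lt_tile`, `le_tile`, `eq_tile`, `select` |
| Element-wise | `max_tile`, `min_tile` |
| Constants | `constant`, `iota` |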
Continue to Async Execution to learn about concurrent CPU/GPU work.