Neural Networks¶

`warpconvnet.nn` ¶

`BilateralFilter` ¶

Bases: Module

KNN/radius bilateral filter (Gaussian on xyz + feat).

Source code in warpconvnet/nn/modules/bilateral.py

class BilateralFilter(nn.Module):
    """KNN/radius bilateral filter (Gaussian on xyz + feat)."""

    def __init__(
        self,
        sigma_xyz: float = 0.05,
        sigma_feat: float = 20.0,
        k: int = 16,
        mode: str = "knn",
        radius_mult: float = 3.0,
        chunk_size: int = 32768,
    ):
        super().__init__()
        self.sigma_xyz = sigma_xyz
        self.sigma_feat = sigma_feat
        self.k = k
        self.mode = mode
        self.radius_mult = radius_mult
        self.chunk_size = chunk_size

    def forward(
        self,
        src_xyz: Tensor,
        src_feat: Tensor,
        src_value: Tensor,
        query_xyz: Tensor | None = None,
        query_feat: Tensor | None = None,
    ) -> Tensor:
        return bilateral_filter(
            src_xyz=src_xyz,
            src_feat=src_feat,
            src_value=src_value,
            query_xyz=query_xyz,
            query_feat=query_feat,
            sigma_xyz=self.sigma_xyz,
            sigma_feat=self.sigma_feat,
            k=self.k,
            mode=self.mode,
            radius_mult=self.radius_mult,
            chunk_size=self.chunk_size,
        )

`BilateralFilterGrid` ¶

Bases: Module

Splat-blur-slice bilateral filter on a sparse d-cube grid (Barron-style).

Source code in warpconvnet/nn/modules/bilateral.py

class BilateralFilterGrid(nn.Module):
    """Module wrapper around ``bilateral_filter_grid``.

    Splat-blur-slice bilateral filter on a sparse d-cube grid (Barron-style).
    Only the two sigmas are stored on the module.
    """

    def __init__(self, sigma_xyz: float = 0.05, sigma_feat: float = 20.0):
        super().__init__()
        self.sigma_xyz, self.sigma_feat = sigma_xyz, sigma_feat

    def forward(self, src_xyz: Tensor, src_feat: Tensor, src_value: Tensor) -> Tensor:
        """Call ``bilateral_filter_grid`` with the stored sigmas."""
        sigmas = {"sigma_xyz": self.sigma_xyz, "sigma_feat": self.sigma_feat}
        return bilateral_filter_grid(src_xyz, src_feat, src_value, **sigmas)

`BilateralFilterGridCached` ¶

Bases: Module

Build-once / filter-many sparse d-cube bilateral grid.

Positions fixed across calls (e.g., per-frame in video), only features differ. Call build_grid once, then forward repeatedly.

Source code in warpconvnet/nn/modules/bilateral.py

class BilateralFilterGridCached(nn.Module):
    """Bilateral grid that is built once and then reused for many filter calls.

    Intended for cases where positions stay fixed across calls (e.g.,
    per-frame in video) and only the values differ. Call ``build_grid`` once,
    then ``forward`` repeatedly.
    """

    def __init__(self, sigma_xyz: float = 0.05, sigma_feat: float = 20.0):
        super().__init__()
        # No grid exists until build_grid() is called.
        self._grid: BilateralGrid | None = None
        self.sigma_xyz = sigma_xyz
        self.sigma_feat = sigma_feat

    def build_grid(self, src_xyz: Tensor, src_feat: Tensor) -> BilateralFilterGridCached:
        """Build and cache the grid from sigma-scaled xyz and feat; returns ``self``."""
        scaled_xyz = src_xyz / self.sigma_xyz
        scaled_feat = src_feat / self.sigma_feat
        self._grid = BilateralGrid.build(torch.cat([scaled_xyz, scaled_feat], dim=-1))
        return self

    def forward(self, src_value: Tensor) -> Tensor:
        """Filter ``src_value`` through the cached grid (always normalized).

        Raises:
            RuntimeError: If ``build_grid`` has not been called yet.
        """
        grid = self._grid
        if grid is not None:
            return grid.filter(src_value, normalize=True)
        raise RuntimeError("Call build_grid(src_xyz, src_feat) before forward().")

    @property
    def num_vertices(self) -> int:
        """Vertex count reported by the cached grid, or 0 when no grid is built."""
        return 0 if self._grid is None else self._grid.num_vertices

`BilateralPermutohedralFilter` ¶

Bases: Module

Bilateral (xyz + color) permutohedral filter.

Lattice coords = concat(xyz / sigma_xyz, feat / sigma_feat). xyz alone is just a Gaussian blur; feat (e.g. RGB) is what makes it edge-preserving. Constraint: D_xyz + D_feat <= 6 (lattice axes capped at 7).

Source code in warpconvnet/nn/modules/permutohedral.py

class BilateralPermutohedralFilter(nn.Module):
    """Bilateral (xyz + color) permutohedral filter.

    Lattice coords = concat(xyz / sigma_xyz, feat / sigma_feat). xyz alone is
    just a Gaussian blur; feat (e.g. RGB) is what makes it edge-preserving.
    Constraint: D_xyz + D_feat <= 6 (lattice axes capped at 7).
    """

    def __init__(self, sigma_xyz: float = 0.05, sigma_feat: float = 20.0):
        super().__init__()
        self.sigma_xyz = sigma_xyz
        self.sigma_feat = sigma_feat

    def forward(
        self,
        src_xyz: Tensor,
        src_feat: Tensor,
        src_value: Tensor,
        query_xyz: Tensor | None = None,
        query_feat: Tensor | None = None,
        *,
        normalize: bool = True,
    ) -> Tensor:
        return bilateral_permutohedral_filter(
            src_xyz=src_xyz,
            src_feat=src_feat,
            src_value=src_value,
            sigma_xyz=self.sigma_xyz,
            sigma_feat=self.sigma_feat,
            query_xyz=query_xyz,
            query_feat=query_feat,
            normalize=normalize,
        )

`BilateralPermutohedralFilterCached` ¶

Bases: Module

Build-once / filter-many bilateral permutohedral.

For iterative bilateral solving on fixed (xyz, feat). Call build_lattice with the source xyz + feat once, then call forward repeatedly with different value tensors.

Source code in warpconvnet/nn/modules/permutohedral.py

class BilateralPermutohedralFilterCached(nn.Module):
    """Bilateral permutohedral lattice that is built once and reused.

    Meant for iterative bilateral solving on fixed (xyz, feat). Call
    ``build_lattice`` with the source xyz + feat once, then call ``forward``
    repeatedly with different value tensors.
    """

    def __init__(self, sigma_xyz: float = 0.05, sigma_feat: float = 20.0):
        super().__init__()
        # No lattice exists until build_lattice() is called.
        self._lattice: PermutohedralLattice | None = None
        self.sigma_xyz = sigma_xyz
        self.sigma_feat = sigma_feat

    def build_lattice(
        self, src_xyz: Tensor, src_feat: Tensor
    ) -> BilateralPermutohedralFilterCached:
        """Build and cache the lattice from sigma-scaled xyz and feat; returns ``self``.

        Raises:
            ValueError: If the xyz and feat widths (``shape[1]``) sum to more than 6.
        """
        total_dims = src_xyz.shape[1] + src_feat.shape[1]
        if total_dims > 6:
            raise ValueError(f"D_xyz + D_feat = {total_dims} > 6; lattice axes capped at 7.")
        scaled_xyz = src_xyz / self.sigma_xyz
        scaled_feat = src_feat / self.sigma_feat
        self._lattice = PermutohedralLattice.build(
            torch.cat([scaled_xyz, scaled_feat], dim=-1)
        )
        return self

    def forward(
        self,
        src_value: Tensor,
        query_xyz: Tensor | None = None,
        query_feat: Tensor | None = None,
        *,
        normalize: bool = True,
    ) -> Tensor:
        """Filter ``src_value`` through the cached lattice.

        When both query tensors are given they are sigma-scaled, concatenated
        and passed on as ``query_positions``; when neither is given
        ``query_positions`` is ``None``.

        Raises:
            RuntimeError: If ``build_lattice`` has not been called yet.
            ValueError: If exactly one of ``query_xyz`` / ``query_feat`` is given.
        """
        lattice = self._lattice
        if lattice is None:
            raise RuntimeError("Call build_lattice(src_xyz, src_feat) before forward().")

        has_xyz = query_xyz is not None
        has_feat = query_feat is not None
        if has_xyz != has_feat:
            raise ValueError("Pass both query_xyz and query_feat, or neither.")

        qp = None
        if has_xyz:
            scaled_queries = [query_xyz / self.sigma_xyz, query_feat / self.sigma_feat]
            qp = torch.cat(scaled_queries, dim=-1)
        return lattice.filter(src_value, query_positions=qp, normalize=normalize)

    @property
    def num_vertices(self) -> int:
        """Number of rows in the lattice's ``unique_keys``, or 0 when nothing is built."""
        lattice = self._lattice
        return 0 if lattice is None else int(lattice.unique_keys.shape[0])

`FastBilateralSolver` ¶

Bases: Module

Confidence-weighted bilateral smoothing via PCG (Barron & Poole 2015).

Source code in warpconvnet/nn/modules/bilateral.py

class FastBilateralSolver(nn.Module):
    """Module wrapper around ``fast_bilateral_solver``.

    Confidence-weighted bilateral smoothing via PCG (Barron & Poole 2015).
    The constructor stores the solver settings; ``forward`` passes them on.
    """

    def __init__(
        self,
        sigma_xyz: float = 0.05,
        sigma_feat: float = 20.0,
        lam: float = 128.0,
        max_iters: int = 25,
        tol: float = 1e-5,
    ):
        super().__init__()
        self.sigma_xyz, self.sigma_feat = sigma_xyz, sigma_feat
        self.lam = lam
        self.max_iters, self.tol = max_iters, tol

    def forward(
        self,
        src_xyz: Tensor,
        src_feat: Tensor,
        target: Tensor,
        confidence: Tensor,
    ) -> Tensor:
        """Call ``fast_bilateral_solver`` on the inputs with the stored settings."""
        solver_options = {
            "sigma_xyz": self.sigma_xyz,
            "sigma_feat": self.sigma_feat,
            "lam": self.lam,
            "max_iters": self.max_iters,
            "tol": self.tol,
        }
        return fast_bilateral_solver(
            src_xyz=src_xyz,
            src_feat=src_feat,
            target=target,
            confidence=confidence,
            **solver_options,
        )

`PermutohedralFilter` ¶

Bases: Module

Gaussian filter via permutohedral lattice (Adams, Baek, Davis 2010).

Pre-scales positions by sigmas (per-axis) or sigma (scalar) and runs splat -> blur -> slice. Lattice coords have d+1 axes so the input feature dim is bounded to d <= 6 by the underlying PackedHashTable128.

Source code in warpconvnet/nn/modules/permutohedral.py

class PermutohedralFilter(nn.Module):
    """Gaussian filter via permutohedral lattice (Adams, Baek, Davis 2010).

    Pre-scales positions by ``sigmas`` (per-axis) or ``sigma`` (scalar) and
    runs splat -> blur -> slice. Lattice coords have d+1 axes so the input
    feature dim is bounded to d <= 6 by the underlying PackedHashTable128.
    """

    def __init__(
        self,
        sigma: float | None = None,
        sigmas: Sequence[float] | None = None,
    ):
        super().__init__()
        if (sigma is None) == (sigmas is None):
            raise ValueError("Pass exactly one of sigma (scalar) or sigmas (per-axis).")
        self.sigma = sigma
        if sigmas is not None:
            self.register_buffer(
                "sigmas",
                torch.as_tensor(list(sigmas), dtype=torch.float32),
            )
        else:
            self.sigmas = None

    def forward(
        self,
        positions: Tensor,
        features: Tensor,
        query_positions: Tensor | None = None,
    ) -> Tensor:
        sigmas = self.sigmas
        if sigmas is not None:
            sigmas = sigmas.to(device=positions.device, dtype=positions.dtype)
        return permutohedral_filter(
            positions=positions,
            features=features,
            sigmas=sigmas,
            sigma=self.sigma,
            query_positions=query_positions,
        )

`PermutohedralFilterCached` ¶

Bases: Module

Build-once / filter-many permutohedral lattice.

For pipelines where positions are fixed (video frame sequence, iterative bilateral solving) and only features change. Call build_lattice once, then forward repeatedly with different feature tensors.

Source code in warpconvnet/nn/modules/permutohedral.py

class PermutohedralFilterCached(nn.Module):
    """Permutohedral lattice that is built once and reused for many filter calls.

    For pipelines where positions are fixed (video frame sequence, iterative
    bilateral solving) and only features change. Call ``build_lattice`` once,
    then ``forward`` repeatedly with different feature tensors.
    """

    def __init__(
        self,
        sigma: float | None = None,
        sigmas: Sequence[float] | None = None,
    ):
        """Store exactly one of ``sigma`` / ``sigmas``; no lattice is built yet.

        Raises:
            ValueError: If both or neither of ``sigma`` and ``sigmas`` are given.
        """
        super().__init__()
        scalar_given = sigma is not None
        per_axis_given = sigmas is not None
        if scalar_given == per_axis_given:
            raise ValueError("Pass exactly one of sigma (scalar) or sigmas (per-axis).")
        self.sigma = sigma
        if per_axis_given:
            per_axis = torch.as_tensor(list(sigmas), dtype=torch.float32)
            self.register_buffer("sigmas", per_axis)
        else:
            self.sigmas = None
        self._lattice: PermutohedralLattice | None = None

    def _rescale(self, points: Tensor) -> Tensor:
        """Divide ``points`` by the per-axis sigmas if set, else by the scalar sigma."""
        per_axis = self.sigmas
        if per_axis is None:
            return points / self.sigma
        # The buffer is cast to the device and dtype of the tensor being scaled.
        return points / per_axis.to(device=points.device, dtype=points.dtype)

    def build_lattice(self, positions: Tensor) -> PermutohedralFilterCached:
        """Build and cache the lattice from sigma-scaled ``positions``; returns ``self``."""
        self._lattice = PermutohedralLattice.build(self._rescale(positions))
        return self

    def forward(
        self,
        features: Tensor,
        query_positions: Tensor | None = None,
    ) -> Tensor:
        """Filter ``features`` through the cached lattice.

        ``query_positions``, when given, are sigma-scaled the same way as the
        positions passed to ``build_lattice``.

        Raises:
            RuntimeError: If ``build_lattice`` has not been called yet.
        """
        lattice = self._lattice
        if lattice is None:
            raise RuntimeError("Call build_lattice(positions) before forward().")
        if query_positions is not None:
            query_positions = self._rescale(query_positions)
        return lattice.filter(features, query_positions=query_positions)

    @property
    def num_vertices(self) -> int:
        """Number of rows in the lattice's ``unique_keys``, or 0 when nothing is built."""
        lattice = self._lattice
        return 0 if lattice is None else int(lattice.unique_keys.shape[0])

Modules¶

Activations¶

`warpconvnet.nn.modules.activations` ¶

`DropPath` ¶

Bases: BaseSpatialModule

Stochastic depth regularization.

Parameters:	`drop_prob` (`float`, default: `0.0` ) – Probability of dropping a sample. Defaults to `0.0`. `scale_by_keep` (`bool`, default: `True` ) – If `True`, the samples that are kept are divided by the keep probability `1 - drop_prob`. Defaults to `True`.

Source code in warpconvnet/nn/modules/activations.py

class DropPath(BaseSpatialModule):
    """Stochastic depth regularization (module form of ``drop_path``).

    Parameters
    ----------
    drop_prob : float, optional
        Probability of dropping a sample. Defaults to ``0.0``.
    scale_by_keep : bool, optional
        Passed through to ``drop_path``; when ``True`` the kept samples are
        divided by the keep probability ``1 - drop_prob``. Defaults to ``True``.
    """

    def __init__(self, drop_prob: float = 0.0, scale_by_keep: bool = True):
        super().__init__()
        self.drop_prob, self.scale_by_keep = drop_prob, scale_by_keep

    def forward(self, x: Union[Geometry, Tensor]):  # noqa: F821
        """Apply ``drop_path`` to a tensor, or to the features of a ``Geometry``."""
        if not isinstance(x, Geometry):
            return drop_path(x, self.drop_prob, self.training, self.scale_by_keep)
        # For a Geometry only the feature tensor is dropped; the result is
        # written back through ``replace``.
        dropped = drop_path(
            x.feature_tensor, self.drop_prob, self.training, self.scale_by_keep
        )
        return x.replace(batched_features=dropped)

    def extra_repr(self):
        """Return the drop probability, rounded to three decimals, for ``repr``."""
        return f"drop_prob={round(self.drop_prob, 3): 0.3f}"

`ELU` ¶

Bases: BaseSpatialModule

Applies the ELU activation to Geometry features.

Source code in warpconvnet/nn/modules/activations.py

class ELU(BaseSpatialModule):
    """Module form of the ELU activation for ``Geometry`` features.

    ``forward`` hands its argument to the functional ``elu`` helper and returns
    the helper's result unchanged.
    """

    def forward(self, input: Geometry):  # noqa: F821
        activated = elu(input)
        return activated

`GELU` ¶

Bases: BaseSpatialModule

Applies the GELU activation to Geometry features.

Source code in warpconvnet/nn/modules/activations.py

class GELU(BaseSpatialModule):
    """Module form of the GELU activation for ``Geometry`` features.

    ``forward`` hands its argument to the functional ``gelu`` helper and
    returns the helper's result unchanged.
    """

    def forward(self, input: Geometry):  # noqa: F821
        activated = gelu(input)
        return activated

`LeakyReLU` ¶

Bases: Module

Applies the LeakyReLU activation to Geometry features.

Source code in warpconvnet/nn/modules/activations.py

class LeakyReLU(nn.Module):
    """Module form of the LeakyReLU activation for ``Geometry`` features.

    ``forward`` hands its argument to the functional ``leaky_relu`` helper and
    returns the helper's result unchanged.
    """

    # NOTE(review): the sibling activation modules in this file derive from
    # ``BaseSpatialModule`` while this one derives from ``nn.Module`` —
    # confirm whether the difference is intentional.

    def forward(self, input: Geometry):  # noqa: F821
        activated = leaky_relu(input)
        return activated

`LogSoftmax` ¶

Bases: BaseSpatialModule

Applies the log_softmax activation to Geometry features.

Source code in warpconvnet/nn/modules/activations.py

class LogSoftmax(BaseSpatialModule):
    """Module form of the ``log_softmax`` activation for ``Geometry`` features.

    ``forward`` hands its argument to the functional ``log_softmax`` helper
    and returns the helper's result unchanged.
    """

    def forward(self, input: Geometry):  # noqa: F821
        activated = log_softmax(input)
        return activated

`ReLU` ¶

Bases: BaseSpatialModule

Apply the ReLU activation to Geometry features.

Parameters:	`inplace` (`bool`, default: `False` ) – Whether to perform the operation in-place. Defaults to `False`.

Source code in warpconvnet/nn/modules/activations.py

class ReLU(BaseSpatialModule):
    """ReLU activation for ``Geometry`` features.

    Wraps a ``torch.nn.ReLU`` and applies it through
    ``apply_feature_transform``.

    Parameters
    ----------
    inplace : bool, optional
        Forwarded to the wrapped ``torch.nn.ReLU``. Defaults to ``False``.
    """

    def __init__(self, inplace: bool = False):
        super().__init__()
        self.relu = nn.ReLU(inplace=inplace)

    def __repr__(self):
        cls_name = type(self).__name__
        return f"{cls_name}(inplace={self.relu.inplace})"

    def forward(self, input: Geometry):  # noqa: F821
        transformed = apply_feature_transform(input, self.relu)
        return transformed

`SiLU` ¶

Bases: BaseSpatialModule

Applies the SiLU activation to Geometry features.

Source code in warpconvnet/nn/modules/activations.py

class SiLU(BaseSpatialModule):
    """Module form of the SiLU activation for ``Geometry`` features.

    ``forward`` hands its argument to the functional ``silu`` helper and
    returns the helper's result unchanged.
    """

    def forward(self, input: Geometry):  # noqa: F821
        activated = silu(input)
        return activated

`Sigmoid` ¶

Bases: BaseSpatialModule

Applies the sigmoid activation to Geometry features.

Source code in warpconvnet/nn/modules/activations.py

class Sigmoid(BaseSpatialModule):
    """Module form of the sigmoid activation for ``Geometry`` features.

    ``forward`` hands its argument to the functional ``sigmoid`` helper and
    returns the helper's result unchanged.
    """

    def forward(self, input: Geometry):  # noqa: F821
        activated = sigmoid(input)
        return activated

`Softmax` ¶

Bases: BaseSpatialModule

Applies the softmax activation to Geometry features.

Source code in warpconvnet/nn/modules/activations.py

class Softmax(BaseSpatialModule):
    """Module form of the ``softmax`` activation for ``Geometry`` features.

    ``forward`` hands its argument to the functional ``softmax`` helper and
    returns the helper's result unchanged.
    """

    def forward(self, input: Geometry):  # noqa: F821
        activated = softmax(input)
        return activated

`Tanh` ¶

Bases: BaseSpatialModule

Applies the tanh activation to Geometry features.

Source code in warpconvnet/nn/modules/activations.py

class Tanh(BaseSpatialModule):
    """Module form of the ``tanh`` activation for ``Geometry`` features.

    ``forward`` hands its argument to the functional ``tanh`` helper and
    returns the helper's result unchanged.
    """

    def forward(self, input: Geometry):  # noqa: F821
        activated = tanh(input)
        return activated

`drop_path(x, drop_prob: float = 0.0, training: bool = False, scale_by_keep: bool = True)` ¶

Apply stochastic depth to the input tensor.

Parameters:

x (``torch.Tensor``) –

Input tensor to apply stochastic depth to.
drop_prob (float, default: 0.0 ) –

Probability of dropping a sample. Defaults to 0.0.
training (bool, default: False ) –

Whether the module is in training mode. Defaults to False.
scale_by_keep (bool, default: True ) –

If True, the samples that are kept are divided by the keep probability 1 - drop_prob. Defaults to True.

Source code in warpconvnet/nn/modules/activations.py

def drop_path(
    x, drop_prob: float = 0.0, training: bool = False, scale_by_keep: bool = True
):
    """Apply stochastic depth to the input tensor.

    One keep/drop decision is drawn per entry along the first dimension and
    broadcast over all remaining dimensions.

    Parameters
    ----------
    x : ``torch.Tensor``
        Input tensor to apply stochastic depth to.
    drop_prob : float, optional
        Probability of dropping a sample. Defaults to ``0.0``.
    training : bool, optional
        Whether the module is in training mode. Defaults to ``False``.
    scale_by_keep : bool, optional
        If ``True`` the kept samples are divided by the keep probability
        ``1 - drop_prob`` (skipped when that probability is 0). Defaults to
        ``True``.

    Returns
    -------
    ``torch.Tensor``
        ``x`` itself when not training or when ``drop_prob`` is ``0.0``;
        otherwise ``x`` multiplied by the sampled (and optionally rescaled) mask.
    """
    if not training or drop_prob == 0.0:
        return x

    keep_prob = 1 - drop_prob
    # Shape (N, 1, ..., 1): a single Bernoulli draw per first-dim entry.
    mask_shape = (x.shape[0], *([1] * (x.ndim - 1)))
    keep_mask = x.new_empty(mask_shape).bernoulli_(keep_prob)
    if scale_by_keep and keep_prob > 0.0:
        keep_mask.div_(keep_prob)
    return x * keep_mask

Attention¶

`warpconvnet.nn.modules.attention` ¶

`Attention` ¶

Bases: Module

Source code in warpconvnet/nn/modules/attention.py

class Attention(nn.Module):
    """Multi-head self-attention over a padded batch of shape ``[B, N, C]``.

    ``enable_flash`` selects one of two execution paths at construction time:
    the ``flash_attn`` kernels, or an explicit softmax(QK^T)V computation in
    PyTorch. Padding is handled through ``num_points`` (both paths) and
    ``mask`` (non-flash path only).
    """

    def __init__(
        self,
        dim: int,
        num_heads: int = 8,
        qkv_bias: bool = False,
        qk_scale: Optional[float] = None,
        attn_drop: float = 0.0,
        proj_drop: float = 0.0,
        enable_flash: bool = True,
        use_batched_qkv: bool = True,
    ):
        """
        Attention module with optional batched QKV for Muon optimization.

        Args:
            dim: Input feature dimension
            num_heads: Number of attention heads
            qkv_bias: Whether to use bias in QKV projection
            qk_scale: Scale factor for attention scores
            attn_drop: Attention dropout rate
            proj_drop: Output projection dropout rate
            enable_flash: Whether to use flash attention
            use_batched_qkv: If True, uses separate Q, K, V matrices stacked as [3, dim, dim]
                           for Muon optimization. Muon can orthogonalize the [dim, dim] matrices
                           more effectively than the concatenated [dim, 3*dim] matrix.
        """
        super().__init__()
        self.num_heads = num_heads
        # NOTE(review): floor division; divisibility of dim by num_heads is
        # not checked here.
        head_dim = dim // num_heads
        # ``or`` means a falsy qk_scale (None or 0.0) selects head_dim ** -0.5.
        self.scale = qk_scale or head_dim**-0.5
        self.enable_flash = enable_flash
        self.use_batched_qkv = use_batched_qkv

        if enable_flash:
            assert flash_attn is not None, "Make sure flash_attn is installed."
            # The flash kernels receive the dropout probability as an argument,
            # so only the rate is stored (no nn.Dropout module on this path).
            self.attn_drop_p = attn_drop
        else:
            self.attn_drop = nn.Dropout(attn_drop)

        if use_batched_qkv:
            # Use BatchedLinear for Muon-friendly QKV projection
            self.qkv = BatchedLinear(dim, dim, num_matrices=3, bias=qkv_bias)
        else:
            # Original single linear layer approach
            self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)

        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(
        self,
        x: Float[Tensor, "B N C"],  # noqa: F821
        pos_enc: Optional[Float[Tensor, "B N C"]] = None,  # noqa: F821
        mask: Optional[Float[Tensor, "B N N"]] = None,  # noqa: F821
        num_points: Optional[Int[Tensor, "B"]] = None,  # noqa: F821
    ) -> Float[Tensor, "B N C"]:
        """Apply self-attention to ``x``.

        Args:
            x: Input features, unpacked as ``B, N, C = x.shape``.
            pos_enc: Optional positional encoding. With flash attention it is
                added to ``x`` before the QKV projection; otherwise it is
                added to the projected queries and keys.
            mask: Optional boolean key-validity mask (True = valid key). Only
                consulted on the non-flash path.
            num_points: Optional per-sample valid lengths. On the flash path
                it selects the variable-length kernel; on both paths the
                output is passed through ``zero_out_points`` when it is given.

        Returns:
            The attended features after the output projection and dropout.
        """
        B, N, C = x.shape

        # Compute QKV with unified approach
        if pos_enc is not None and self.enable_flash:
            # Add positional encoding to input before QKV projection for flash attention
            qkv = self.qkv(x + pos_enc).reshape(B, N, 3, C)
        else:
            qkv = self.qkv(x).reshape(B, N, 3, C)

        # Reshape to [B, N, 3, num_heads, head_dim]
        qkv = qkv.reshape(B, N, 3, self.num_heads, C // self.num_heads)

        if not self.enable_flash:
            # [B, N, 3, H, D] -> [3, B, H, N, D]
            qkv = qkv.permute(2, 0, 3, 1, 4)
            q, k, v = (
                qkv[0],
                qkv[1],
                qkv[2],
            )  # make torchscript happy (cannot use tensor as tuple)

            # Apply positional encoding to the query and key (non-flash path)
            # NOTE(review): q and k are [B, num_heads, N, head_dim] after the
            # permute above, whereas pos_enc.unsqueeze(1) is [B, 1, N, C] for a
            # pos_enc of the annotated shape; the addition only broadcasts when
            # head_dim == C (or pos_enc is already head_dim wide) — confirm.
            if pos_enc is not None:
                q = q + pos_enc.unsqueeze(1)
                k = k + pos_enc.unsqueeze(1)

            attn = (q @ k.transpose(-2, -1)) * self.scale
            if mask is not None:
                # mask is a bool key-validity mask [B, 1, M, M] (True = valid
                # key). Padded keys must be removed from the softmax, so set
                # their scores to -inf instead of adding the mask (which only
                # shifted valid scores by +1 and left padded keys participating).
                attn = attn.masked_fill(~mask, float("-inf"))
                # An empty batch element (num_points==0) leaves a fully-masked
                # row whose softmax is NaN; those padded-query rows are dropped
                # by zero_out_points anyway, so map NaN->0 to keep grads finite.
                # Only reachable when masking set a row fully to -inf, so the
                # scan is skipped on the unmasked hot path.
                attn = torch.nan_to_num(attn.softmax(dim=-1))
            else:
                attn = attn.softmax(dim=-1)
            attn = self.attn_drop(attn)

            x = attn @ v
            # [B, H, N, D] -> [B, N, H, D] -> [B, N, C]
            x = x.transpose(1, 2).reshape(B, N, C)
        else:
            # Flash attention path
            # Flash attention - preserve original dtype if possible
            original_dtype = qkv.dtype
            if qkv.dtype not in [torch.float16, torch.bfloat16]:
                # Convert to half precision for flash attention
                qkv_flash = qkv.half()
            else:
                qkv_flash = qkv

            if num_points is not None:
                # Padded batch: unpad to a variable-length packed layout so
                # padded tokens never enter the softmax. flash_attn_qkvpacked_func
                # has no mask argument, so masking is impossible on the dense
                # [B, N, ...] path; varlen is the only correct option (same
                # approach as PatchAttention).
                num_points_dev = num_points.to(qkv_flash.device)
                valid = (
                    torch.arange(N, device=qkv_flash.device)[None, :] < num_points_dev[:, None]
                )  # [B, N] bool
                qkv_unpad = qkv_flash[valid]  # [total, 3, num_heads, head_dim]
                # cumsum upcasts int32->int64, and flash requires int32 cu_seqlens,
                # so cast again after the pad.
                cu_seqlens = F.pad(num_points_dev.cumsum(0), (1, 0)).to(torch.int32)
                # N is the pad width (== max num_points), a valid upper bound
                # for max_seqlen. Using it avoids a per-forward device->host
                # sync from num_points.max().
                max_seqlen = N
                out_unpad = flash_attn.flash_attn_varlen_qkvpacked_func(
                    qkv_unpad,
                    cu_seqlens,
                    max_seqlen=max_seqlen,
                    dropout_p=self.attn_drop_p if self.training else 0.0,
                    softmax_scale=self.scale,
                ).reshape(-1, C)
                # Scatter the packed outputs back into a zero-filled padded
                # [B, N, C] tensor; padded slots stay zero.
                x = torch.zeros(B, N, C, dtype=out_unpad.dtype, device=out_unpad.device)
                x[valid] = out_unpad
            else:
                x = flash_attn.flash_attn_qkvpacked_func(
                    qkv_flash,
                    dropout_p=self.attn_drop_p if self.training else 0.0,
                    softmax_scale=self.scale,
                ).reshape(B, N, C)

            # Convert back to original dtype if necessary
            if x.dtype != original_dtype:
                x = x.to(original_dtype)

        x = self.proj(x)
        x = self.proj_drop(x)

        # The projection bias and dropout can make padded rows non-zero again,
        # so they are zeroed after the projection.
        if num_points is not None:
            x = zero_out_points(x, num_points)
        return x

`PatchAttention` ¶

Bases: BaseSpatialModule

Source code in warpconvnet/nn/modules/attention.py

class PatchAttention(BaseSpatialModule):
    """Flash-attention self-attention restricted to fixed-size patches of a ``Geometry``.

    ``forward`` (re)orders the points with ``encode`` when the input's ordering
    differs from the requested one, cuts every batch element into consecutive
    runs of at most ``patch_size`` points, and evaluates attention within each
    run through ``flash_attn.flash_attn_varlen_qkvpacked_func``.
    """

    def __init__(
        self,
        dim: int,
        patch_size: int,
        num_heads: int = 8,
        qkv_bias: bool = False,
        qk_scale: Optional[float] = None,
        attn_drop: float = 0.0,
        proj_drop: float = 0.0,
        order: POINT_ORDERING = POINT_ORDERING.MORTON_XYZ,
        use_batched_qkv: bool = True,
        use_rope: bool = False,
        rope_base: int = 10_000,
    ):
        """
        Patch attention module with optional batched QKV for Muon optimization.

        Args:
            dim: Input feature dimension
            patch_size: Size of patches for attention computation
            num_heads: Number of attention heads
            qkv_bias: Whether to use bias in QKV projection
            qk_scale: Scale factor for attention scores
            attn_drop: Attention dropout rate
            proj_drop: Output projection dropout rate
            order: Point ordering for patch generation
            use_batched_qkv: If True, uses separate Q, K, V matrices stacked as [3, dim, dim]
                           for Muon optimization. Muon can orthogonalize the [dim, dim] matrices
                           more effectively than the concatenated [dim, 3*dim] matrix.
            use_rope: If True, apply 3D RoPE to Q and K via the fused CUDA
                kernel. Uses point-cloud coordinates for the rotation phase.
            rope_base: RoPE base. Use
                `warpconvnet.nn.modules.rope.suggest_voxel_rope_base` for a
                window-aware default.
        """
        super().__init__()
        self.patch_size = patch_size
        self.num_heads = num_heads
        assert dim % num_heads == 0, "dim must be divisible by num_heads"
        head_dim = dim // num_heads
        # ``or`` means a falsy qk_scale (None or 0.0) selects head_dim ** -0.5.
        self.scale = qk_scale or head_dim**-0.5
        self.use_batched_qkv = use_batched_qkv

        if use_batched_qkv:
            # Use BatchedLinear for Muon-friendly QKV projection
            self.qkv = BatchedLinear(dim, dim, num_matrices=3, bias=qkv_bias)
        else:
            # Original single linear layer approach
            self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)

        self.order = order
        # This module has no non-flash fallback: flash_attn is required.
        assert flash_attn is not None, "Make sure flash_attn is installed."
        self.attn_drop_p = attn_drop

        self.use_rope = use_rope
        if use_rope:
            self.rope = VoxelRotaryPositionalEmbeddings(
                dim=dim,
                num_heads=num_heads,
                base=rope_base,
            )

        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def _offset_to_attn_offset(
        self, offsets: Int[Tensor, "B+1"], patch_size: Optional[int] = None
    ) -> Int[Tensor, "B"]:
        """
        Convert offsets to cumulative attention offsets required for flash attention.
        If the patch size is 8 and the offsets are [0, 3, 11, 40] (3 batches),
        the cumulative attention offsets are [0, 3, 3 + 8 = 11, 11 + 8, 11 + 8 + 8, 11 + 8 + 8 + 8, 40].

        Patches never cross a batch boundary: each batch element is split
        separately, and its last patch may hold fewer than ``patch_size`` points.

        Args:
            offsets: (B+1) cumulative point counts per batch element
            patch_size: Optional[int]; falls back to ``self.patch_size`` when falsy
        Returns:
            cum_seqlens: 1-D tensor holding the start offset of every patch
            followed by ``offsets[-1]`` (one more entry than there are
            patches). When there are no patches at all, ``offsets`` itself is
            returned.
        """
        patch_size = patch_size or self.patch_size
        # Number of points in each batch element.
        counts = torch.diff(offsets)

        # Calculate number of patches per batch using ceil division
        num_patches_per_batch = (counts + patch_size - 1) // patch_size

        # Fast path: if no patches, return original offsets
        if num_patches_per_batch.sum() == 0:
            return offsets

        # Calculate how many elements each batch contributes (num_patches)
        # We generate the start indices for each patch. The final end point is added later.
        elements_per_batch = num_patches_per_batch

        # Create indices for which batch each element belongs to
        batch_indices = torch.repeat_interleave(
            torch.arange(len(offsets) - 1, device=offsets.device), elements_per_batch
        )

        # Create indices for position within each batch's sequence (0, 1, 2, ...)
        within_batch_indices = torch.cat(
            [
                torch.arange(n, device=offsets.device, dtype=offsets.dtype)
                for n in num_patches_per_batch
            ]
        )

        # Calculate the actual offsets: start_offset + patch_index * patch_size
        start_offsets = offsets[:-1][batch_indices]
        patch_contributions = within_batch_indices * patch_size
        result_middle = start_offsets + patch_contributions

        # Add the final offset
        result = torch.cat([result_middle, offsets[-1].unsqueeze(0)])

        return result.contiguous()

    def forward(self, x: Geometry, order: Optional[POINT_ORDERING] = None) -> Geometry:
        """Apply patch-wise self-attention to the features of ``x``.

        Args:
            x: Input geometry; its ``features``, ``coordinate_tensor`` and
                ``offsets`` are read.
            order: Optional ordering that overrides ``self.order`` for this call.

        Returns:
            ``x.replace(batched_features=...)`` with the attended features, in
            the same point order as the input.
        """
        # NOTE: x is expected to be serialized; nothing is asserted here — the
        # ordering is recomputed below when it differs from the requested one.
        K = self.patch_size

        feats = x.features
        coords = x.coordinate_tensor
        # M points in total, C channels — assumes feats is 2-D; TODO confirm.
        M, C = feats.shape[:2]
        inverse_perm = None
        order = order or self.order
        if not hasattr(x, "order") or (order != x.order):
            # Generate new ordering and inverse permutation
            code_result = encode(
                x.coordinate_tensor,
                batch_offsets=x.offsets,
                order=order,
                return_perm=True,
                return_inverse=True,
            )
            feats = feats[code_result.perm]
            # Coordinates are only consumed by RoPE, so they are only permuted
            # when RoPE is enabled.
            if self.use_rope:
                coords = coords[code_result.perm]
            inverse_perm = code_result.inverse_perm

        if self.use_rope:
            # NOTE(review): presumably self.rope returns qkv already shaped for
            # the packed flash call below — confirm against the rope module.
            qkv = self.rope(self.qkv(feats), coords)
        else:
            qkv = self.qkv(feats).reshape(M, 3, self.num_heads, C // self.num_heads)
        # Anything that is not already fp16/bf16 is cast to fp16 before the
        # flash-attention call.
        if qkv.dtype not in [torch.float16, torch.bfloat16]:
            qkv = qkv.to(torch.float16)

        # Patch boundaries, passed to flash attention as int32 cumulative
        # sequence lengths on the qkv device.
        attn_offsets = self._offset_to_attn_offset(x.offsets, K).to(
            device=qkv.device, dtype=torch.int32
        )
        # Warning: When the loss is NaN, this module will fail during backward with
        # index out of bounds error.
        # e.g. /pytorch/aten/src/ATen/native/cuda/ScatterGatherKernel.cu:144: operator(): block: [192,0,0], thread: [32,0,0] Assertion `idx_dim >= 0 && idx_dim < index_size && "
        # https://discuss.pytorch.org/t/scattergatherkernel-cu-assertion-idx-dim-0-idx-dim-index-size-index-out-of-bounds/195356
        out_feat = flash_attn.flash_attn_varlen_qkvpacked_func(
            qkv,
            attn_offsets,
            max_seqlen=K,
            dropout_p=self.attn_drop_p if self.training else 0.0,
            softmax_scale=self.scale,
        )
        # Merge heads and return to the dtype of the input features.
        out_feat = out_feat.reshape(M, C).to(feats.dtype)

        out_feat = self.proj(out_feat)
        out_feat = self.proj_drop(out_feat)

        # Undo the reordering so the rows line up with x's original point order.
        if inverse_perm is not None:
            out_feat = out_feat[inverse_perm]

        return x.replace(batched_features=out_feat.to(feats.dtype))

`offset_to_mask(x: Float[Tensor, 'B M C'], offsets: Float[Tensor, B + 1], max_num_points: int, dtype: torch.dtype = torch.bool) -> Float[Tensor, 'B 1 M M']` ¶

Create a mask for the points in the batch.

Source code in warpconvnet/nn/modules/attention.py

def offset_to_mask(
    x: Float[Tensor, "B M C"],  # noqa: F821
    offsets: Float[Tensor, "B+1"],  # noqa: F821
    max_num_points: int,  # noqa: F821
    dtype: torch.dtype = torch.bool,
) -> Float[Tensor, "B 1 M M"]:  # noqa: F821
    """
    Create a mask for the points in the batch.

    For batch element ``b`` the number of points is
    ``offsets[b + 1] - offsets[b]``; every row of ``mask[b, 0]`` is ``True``
    on the first ``num_points[b]`` columns and ``False`` on the rest.

    Args:
        x: Batched tensor; only its leading dimension (``B``) and its device
            are read.
        offsets: 1-D tensor with ``B + 1`` entries delimiting each batch
            element.
        max_num_points: Size of the last two dimensions of the mask.
        dtype: Mask dtype. Only ``torch.bool`` is supported.

    Returns:
        A ``torch.bool`` tensor of shape
        ``(B, 1, max_num_points, max_num_points)`` on ``x.device``.

    Raises:
        ValueError: If ``dtype`` is not ``torch.bool``.
    """
    B = x.shape[0]
    assert B == offsets.shape[0] - 1
    # Reject unsupported dtypes before doing any O(B * M^2) allocation.
    if dtype != torch.bool:
        raise ValueError(f"Unsupported dtype: {dtype}")

    num_points = offsets.diff().to(x.device)
    column = torch.arange(max_num_points, device=x.device)
    # One broadcast comparison replaces the per-batch Python loop, which
    # needed a host round-trip for every ``num_points[b]`` slice bound.
    valid = column.reshape(1, 1, 1, max_num_points) < num_points.reshape(B, 1, 1, 1)
    # ``expand`` only creates a view; materialise it so the returned mask owns
    # its memory and can be written to by callers.
    return valid.expand(B, 1, max_num_points, max_num_points).contiguous()

`zero_out_points(x: Float[Tensor, 'B N C'], num_points: Int[Tensor, B]) -> Float[Tensor, 'B N C']` ¶

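Zero out, in place, the entries of each batch element at or beyond its `num_points` index along the point dimension, and return the same tensor.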

Source code in warpconvnet/nn/modules/attention.py

def zero_out_points(
    x: Float[Tensor, "B N C"], num_points: Int[Tensor, "B"]  # noqa: F821
) -> Float[Tensor, "B N C"]:  # noqa: F821
    """
    Zero the tail of every batch element, in place.

    For each index ``b`` of ``num_points``, all entries ``x[b, n]`` with
    ``n >= num_points[b]`` are set to 0.

    Args:
        x: Batched tensor that is modified in place.
        num_points: Per-batch count of leading entries to keep.

    Returns:
        The same tensor object ``x`` after modification.
    """
    for batch_idx, keep_count in enumerate(num_points):
        # Everything from ``keep_count`` onward along dim 1 is cleared.
        x[batch_idx, keep_count:] = 0
    return x

Base module¶

`warpconvnet.nn.modules.base_module` ¶

`BaseSpatialModel` ¶

Bases: BaseSpatialModule

Base model class.

Source code in warpconvnet/nn/modules/base_module.py

class BaseSpatialModel(BaseSpatialModule):
    """Base model class.

    Declares the hooks a concrete model is expected to provide. Every hook
    defined here raises ``NotImplementedError`` until a subclass overrides it.
    """

    def data_dict_to_input(self, data_dict, **kwargs) -> Any:
        """Convert data dictionary to appropriate input for the model.

        Raises:
            NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError()

    def loss_dict(self, data_dict, **kwargs) -> Dict:
        """Compute the loss dictionary for the model.

        Raises:
            NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError()

    # NOTE: the decorator wraps only this base implementation; an override in
    # a subclass must re-apply ``torch.no_grad()`` if it wants the same effect.
    @torch.no_grad()
    def eval_dict(self, data_dict, **kwargs) -> Dict:
        """Compute the evaluation dictionary for the model.

        Raises:
            NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError()

    def image_pointcloud_dict(self, data_dict, datamodule) -> Tuple[Dict, Dict]:
        """Compute the image dict and pointcloud dict for the model.

        Raises:
            NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError()

`data_dict_to_input(data_dict, **kwargs) -> Any` ¶

Convert data dictionary to appropriate input for the model.

Source code in warpconvnet/nn/modules/base_module.py

def data_dict_to_input(self, data_dict, **kwargs) -> Any:
    """Convert data dictionary to appropriate input for the model.

    Abstract hook: this implementation does no work and must be overridden.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError()

`eval_dict(data_dict, **kwargs) -> Dict` ¶

Compute the evaluation dictionary for the model.

Source code in warpconvnet/nn/modules/base_module.py

# NOTE: the decorator wraps only this implementation; an override must
# re-apply ``torch.no_grad()`` itself if it wants the same effect.
@torch.no_grad()
def eval_dict(self, data_dict, **kwargs) -> Dict:
    """Compute the evaluation dictionary for the model.

    Abstract hook: this implementation does no work and must be overridden.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError()

`image_pointcloud_dict(data_dict, datamodule) -> Tuple[Dict, Dict]` ¶

Compute the image dict and pointcloud dict for the model.

Source code in warpconvnet/nn/modules/base_module.py

def image_pointcloud_dict(self, data_dict, datamodule) -> Tuple[Dict, Dict]:
    """Compute the image dict and pointcloud dict for the model.

    Abstract hook: this implementation does no work and must be overridden.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError()

`loss_dict(data_dict, **kwargs) -> Dict` ¶

Compute the loss dictionary for the model.

Source code in warpconvnet/nn/modules/base_module.py

def loss_dict(self, data_dict, **kwargs) -> Dict:
    """Compute the loss dictionary for the model.

    Abstract hook: this implementation does no work and must be overridden.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError()

`BaseSpatialModule` ¶

Bases: Module

Base module for spatial features. The input must be an instance of BatchedSpatialFeatures.

Source code in warpconvnet/nn/modules/base_module.py

class BaseSpatialModule(nn.Module):
    """Base module for spatial features. The input must be an instance of `BatchedSpatialFeatures`.

    Subclasses override :meth:`forward`, which is annotated here as taking a
    ``Geometry``.
    """

    @property
    def device(self) -> torch.device:
        """Returns the device that the model is on.

        The device of the first parameter is used. A module without
        parameters falls back to its first buffer, and a module with neither
        reports CPU instead of leaking a bare ``StopIteration`` from ``next``.
        """
        first = next(self.parameters(), None)
        if first is None:
            # Parameter-free modules may still hold registered buffers.
            first = next(self.buffers(), None)
        if first is None:
            return torch.device("cpu")
        return first.device

    def forward(self, x: Geometry):
        """Forward pass.

        Raises:
            NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError

`device` `property` ¶

Returns the device that the model is on.

`forward(x: Geometry)` ¶

Forward pass.

Source code in warpconvnet/nn/modules/base_module.py

def forward(self, x: Geometry):
    """Forward pass.

    Abstract hook: this implementation does no work and must be overridden.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError()

Factor grid¶

`warpconvnet.nn.modules.factor_grid` ¶

Neural network modules for FactorGrid operations.

This module provides neural network layers and operations specifically designed for working with FactorGrid geometries in the FIGConvNet architecture.