[gpt-oss] Add gpt-oss mxfp4 support

2025-08-25 17:41:34 +08:00
parent db7f48eeac
commit ce688181e6
33 changed files with 4835 additions and 1192 deletions
--- a/vllm/v1/attention/backends/pallas.py
+++ b/vllm/v1/attention/backends/pallas.py
@@ -110,7 +110,6 @@ class PallasAttentionBackendImpl(AttentionImpl):
        alibi_slopes: Optional[list[float]],
        sliding_window: Optional[int],
        kv_cache_dtype: str,
-        blocksparse_params: Optional[dict[str, Any]] = None,
        logits_soft_cap: Optional[float] = None,
        attn_type: str = AttentionType.DECODER,
        kv_sharing_target_layer_name: Optional[int] = None,
@@ -120,9 +119,6 @@ class PallasAttentionBackendImpl(AttentionImpl):
            logger.warning_once(
                "Using irope in Pallas is not supported yet, it will fall back "
                "to global attention for long context.")
-        if blocksparse_params is not None:
-            raise ValueError("Paged attention Pallas kernel does "
-                             "not support block-sparse attention.")
        self.num_heads = num_heads
        self.head_size = head_size
        self.scale = float(scale)
@@ -139,8 +135,6 @@ class PallasAttentionBackendImpl(AttentionImpl):
            raise NotImplementedError("Alibi slopes is not supported.")
        if kv_cache_dtype != "auto":
            raise NotImplementedError("FP8 KV cache dtype is not supported.")
-        if blocksparse_params is not None:
-            raise NotImplementedError("Blocksparse is not supported.")

        if attn_type != AttentionType.DECODER:
            raise NotImplementedError("Encoder self-attention and "
@@ -161,6 +155,7 @@ class PallasAttentionBackendImpl(AttentionImpl):
        kv_cache: torch.Tensor,
        attn_metadata: PallasMetadata,
        output: Optional[torch.Tensor] = None,
+        output_scale: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Forward pass with Pallas attention.

@@ -173,6 +168,11 @@ class PallasAttentionBackendImpl(AttentionImpl):
        Returns:
            shape = [num_tokens, num_heads * head_size]
        """
+        if output_scale is not None:
+            raise NotImplementedError(
+                "fused output quantization is not yet supported"
+                " for PallasAttentionBackendImpl")
+
        # For determine_available_memory case.
        if kv_cache.numel() == 0:
            if output is None: