Support nvidia/NVIDIA-Nemotron-Nano-9B-v2-FP8/NVFP4 (#11866)

This commit is contained in:
Netanel Haber
2025-10-23 12:29:02 +03:00
committed by GitHub
parent 36a4cad7b0
commit d6fee73d1f
10 changed files with 207 additions and 127 deletions

View File

@@ -373,3 +373,7 @@ def test_causal_conv1d_varlen(
)
unpadded_out = out[:, : out_ref_tensor.shape[-1]]
assert torch.allclose(unpadded_out, out_ref_tensor, rtol=rtol, atol=atol)
# Allow running this test file directly (python <file>) instead of via
# the pytest CLI; pytest.main collects and runs the tests in this module.
if __name__ == "__main__":
    pytest.main([__file__])

View File

@@ -1,5 +1,6 @@
# Adapted from https://github.com/vllm-project/vllm/blob/2c58742dff8613a3bd7496f2008ce927e18d38d1/tests/kernels/mamba/test_mamba_mixer2.py
from unittest.mock import patch
import pytest
@@ -136,3 +137,7 @@ def mixer2_gated_norm_tensor_parallel(
atol=5e-3,
rtol=1e-3,
)
# Entry point for running this test module directly with `python`;
# delegates collection and execution to pytest.
if __name__ == "__main__":
    pytest.main([__file__])

View File

@@ -1,5 +1,6 @@
# Adapted from https://github.com/vllm-project/vllm/blob/633f943e30a4444d890d26b81850f7217736f840/tests/kernels/mamba/test_mamba_ssm_ssd.py
import pytest
import torch
import torch.nn.functional as F
@@ -289,3 +290,7 @@ def test_selective_state_update_with_heads_with_batch_indices(
print(f"Output mean diff: {(out - out_ref).abs().mean().item()}")
assert torch.allclose(state[state_indices, :], state_ref, rtol=rtol, atol=atol)
assert torch.allclose(out, out_ref, rtol=rtol, atol=atol)
# Allow invoking this test module as a standalone script; pytest.main
# runs only the tests defined in this file.
if __name__ == "__main__":
    pytest.main([__file__])

View File

@@ -8,13 +8,12 @@ from einops import rearrange, repeat
from sglang.srt.layers.attention.mamba.mamba2_metadata import Mamba2Metadata
from sglang.srt.layers.attention.mamba.ops import mamba_chunk_scan_combined
from sglang.utils import is_in_ci
# Added by the IBM Team, 2024
# Adapted from https://github.com/state-spaces/mamba/blob/v2.2.4/mamba_ssm/modules/ssd_minimal.py
# TODO: These tests take a long time to run - we should cut down on the parametrized matrix of cases.
# this is the segsum implementation taken from above
def segsum(x):
@@ -191,10 +190,22 @@ def generate_continuous_batched_examples(
)
@pytest.mark.parametrize("itype", [torch.float32, torch.float16, torch.bfloat16])
@pytest.mark.parametrize("n_heads", [3, 4, 11, 16, 32])
@pytest.mark.parametrize("d_head", [5, 8, 19, 32, 128])
@pytest.mark.parametrize("seq_len_chunk_size", [(112, 16), (128, 32)])
# Parameter grids for the single-example chunk-scan test below.
# dtypes to exercise
SINGLE_ITYPE = [torch.float32, torch.float16, torch.bfloat16]
# head counts, including non-power-of-two values
SINGLE_NHEADS = [3, 4, 11, 16, 32]
# per-head dimensions, including non-power-of-two values
SINGLE_DHEAD = [5, 8, 19, 32, 128]
# (sequence length, chunk size) pairs
SINGLE_SEQ_LEN_CHUNK_SIZE = [(112, 16), (128, 32)]
# In CI, shrink each grid to a small representative subset (roughly the
# smallest and largest values) to keep total test runtime down.
if is_in_ci():
    SINGLE_ITYPE = [torch.float32, torch.bfloat16]
    SINGLE_NHEADS = [3, 32]
    SINGLE_DHEAD = [5, 128]
    SINGLE_SEQ_LEN_CHUNK_SIZE = [(112, 16)]
@pytest.mark.parametrize("itype", SINGLE_ITYPE)
@pytest.mark.parametrize("n_heads", SINGLE_NHEADS)
@pytest.mark.parametrize("d_head", SINGLE_DHEAD)
@pytest.mark.parametrize("seq_len_chunk_size", SINGLE_SEQ_LEN_CHUNK_SIZE)
def test_mamba_chunk_scan_single_example(d_head, n_heads, seq_len_chunk_size, itype):
if not torch.cuda.is_available():
pytest.skip("CUDA device not available")
@@ -238,9 +249,19 @@ def test_mamba_chunk_scan_single_example(d_head, n_heads, seq_len_chunk_size, it
)
@pytest.mark.parametrize("itype", [torch.float32, torch.float16])
@pytest.mark.parametrize("n_heads", [4, 8, 13])
@pytest.mark.parametrize("d_head", [5, 16, 21, 32])
# Parameter grids for the continuous-batched chunk-scan test below.
# dtypes to exercise
BATCHED_ITYPE = [torch.float32, torch.float16]
# head counts, including a non-power-of-two value
BATCHED_NHEADS = [4, 8, 13]
# per-head dimensions, including non-power-of-two values
BATCHED_DHEAD = [5, 16, 21, 32]
# In CI, shrink each grid to its boundary values to keep runtime down
# while still covering the smallest and largest cases.
if is_in_ci():
    BATCHED_ITYPE = [torch.float32]
    BATCHED_NHEADS = [4, 13]
    BATCHED_DHEAD = [5, 32]
@pytest.mark.parametrize("itype", BATCHED_ITYPE)
@pytest.mark.parametrize("n_heads", BATCHED_NHEADS)
@pytest.mark.parametrize("d_head", BATCHED_DHEAD)
@pytest.mark.parametrize(
"seq_len_chunk_size_cases",
[
@@ -579,3 +600,7 @@ def test_mamba_chunk_scan_cont_batch_prefill_chunking(chunk_size, seqlens):
rtol=rtol,
msg=lambda x: f"seq{i} state " + x,
) # noqa: B023
# Standard script guard: lets this test module be executed directly,
# handing off to pytest for collection and reporting.
if __name__ == "__main__":
    pytest.main([__file__])