Support nvidia/NVIDIA-Nemotron-Nano-9B-v2-FP8/NVFP4 (#11866)
This commit is contained in:
@@ -373,3 +373,7 @@ def test_causal_conv1d_varlen(
|
||||
)
|
||||
unpadded_out = out[:, : out_ref_tensor.shape[-1]]
|
||||
assert torch.allclose(unpadded_out, out_ref_tensor, rtol=rtol, atol=atol)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
pytest.main([__file__])
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
# Adapted from https://github.com/vllm-project/vllm/blob/2c58742dff8613a3bd7496f2008ce927e18d38d1/tests/kernels/mamba/test_mamba_mixer2.py
|
||||
|
||||
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
@@ -136,3 +137,7 @@ def mixer2_gated_norm_tensor_parallel(
|
||||
atol=5e-3,
|
||||
rtol=1e-3,
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
pytest.main([__file__])
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
# Adapted from https://github.com/vllm-project/vllm/blob/633f943e30a4444d890d26b81850f7217736f840/tests/kernels/mamba/test_mamba_ssm_ssd.py
|
||||
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
@@ -289,3 +290,7 @@ def test_selective_state_update_with_heads_with_batch_indices(
|
||||
print(f"Output mean diff: {(out - out_ref).abs().mean().item()}")
|
||||
assert torch.allclose(state[state_indices, :], state_ref, rtol=rtol, atol=atol)
|
||||
assert torch.allclose(out, out_ref, rtol=rtol, atol=atol)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
pytest.main([__file__])
|
||||
|
||||
@@ -8,13 +8,12 @@ from einops import rearrange, repeat
|
||||
|
||||
from sglang.srt.layers.attention.mamba.mamba2_metadata import Mamba2Metadata
|
||||
from sglang.srt.layers.attention.mamba.ops import mamba_chunk_scan_combined
|
||||
from sglang.utils import is_in_ci
|
||||
|
||||
# Added by the IBM Team, 2024
|
||||
|
||||
# Adapted from https://github.com/state-spaces/mamba/blob/v2.2.4/mamba_ssm/modules/ssd_minimal.py
|
||||
|
||||
# TODO: These take a long time to run - we should cut down on some of the parameterized matrix.
|
||||
|
||||
|
||||
# this is the segsum implementation taken from above
|
||||
def segsum(x):
|
||||
@@ -191,10 +190,22 @@ def generate_continuous_batched_examples(
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("itype", [torch.float32, torch.float16, torch.bfloat16])
|
||||
@pytest.mark.parametrize("n_heads", [3, 4, 11, 16, 32])
|
||||
@pytest.mark.parametrize("d_head", [5, 8, 19, 32, 128])
|
||||
@pytest.mark.parametrize("seq_len_chunk_size", [(112, 16), (128, 32)])
|
||||
SINGLE_ITYPE = [torch.float32, torch.float16, torch.bfloat16]
|
||||
SINGLE_NHEADS = [3, 4, 11, 16, 32]
|
||||
SINGLE_DHEAD = [5, 8, 19, 32, 128]
|
||||
SINGLE_SEQ_LEN_CHUNK_SIZE = [(112, 16), (128, 32)]
|
||||
|
||||
if is_in_ci():
|
||||
SINGLE_ITYPE = [torch.float32, torch.bfloat16]
|
||||
SINGLE_NHEADS = [3, 32]
|
||||
SINGLE_DHEAD = [5, 128]
|
||||
SINGLE_SEQ_LEN_CHUNK_SIZE = [(112, 16)]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("itype", SINGLE_ITYPE)
|
||||
@pytest.mark.parametrize("n_heads", SINGLE_NHEADS)
|
||||
@pytest.mark.parametrize("d_head", SINGLE_DHEAD)
|
||||
@pytest.mark.parametrize("seq_len_chunk_size", SINGLE_SEQ_LEN_CHUNK_SIZE)
|
||||
def test_mamba_chunk_scan_single_example(d_head, n_heads, seq_len_chunk_size, itype):
|
||||
if not torch.cuda.is_available():
|
||||
pytest.skip("CUDA device not available")
|
||||
@@ -238,9 +249,19 @@ def test_mamba_chunk_scan_single_example(d_head, n_heads, seq_len_chunk_size, it
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("itype", [torch.float32, torch.float16])
|
||||
@pytest.mark.parametrize("n_heads", [4, 8, 13])
|
||||
@pytest.mark.parametrize("d_head", [5, 16, 21, 32])
|
||||
BATCHED_ITYPE = [torch.float32, torch.float16]
|
||||
BATCHED_NHEADS = [4, 8, 13]
|
||||
BATCHED_DHEAD = [5, 16, 21, 32]
|
||||
|
||||
if is_in_ci():
|
||||
BATCHED_ITYPE = [torch.float32]
|
||||
BATCHED_NHEADS = [4, 13]
|
||||
BATCHED_DHEAD = [5, 32]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("itype", BATCHED_ITYPE)
|
||||
@pytest.mark.parametrize("n_heads", BATCHED_NHEADS)
|
||||
@pytest.mark.parametrize("d_head", BATCHED_DHEAD)
|
||||
@pytest.mark.parametrize(
|
||||
"seq_len_chunk_size_cases",
|
||||
[
|
||||
@@ -579,3 +600,7 @@ def test_mamba_chunk_scan_cont_batch_prefill_chunking(chunk_size, seqlens):
|
||||
rtol=rtol,
|
||||
msg=lambda x: f"seq{i} state " + x,
|
||||
) # noqa: B023
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
pytest.main([__file__])
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
import unittest
|
||||
from types import SimpleNamespace
|
||||
|
||||
from sglang.srt.utils import kill_process_tree
|
||||
from sglang.srt.utils import is_blackwell, kill_process_tree
|
||||
from sglang.test.few_shot_gsm8k import run_eval
|
||||
from sglang.test.test_utils import (
|
||||
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
||||
@@ -12,9 +12,11 @@ from sglang.test.test_utils import (
|
||||
|
||||
|
||||
class TestNvidiaNemotronNanoV2(CustomTestCase):
|
||||
model = "nvidia/NVIDIA-Nemotron-Nano-9B-v2"
|
||||
accuracy = 0.87
|
||||
|
||||
@classmethod
|
||||
def setUpClass(cls):
|
||||
cls.model = "nvidia/NVIDIA-Nemotron-Nano-9B-v2"
|
||||
cls.base_url = DEFAULT_URL_FOR_TEST
|
||||
cls.process = popen_launch_server(
|
||||
cls.model,
|
||||
@@ -42,7 +44,18 @@ class TestNvidiaNemotronNanoV2(CustomTestCase):
|
||||
)
|
||||
metrics = run_eval(args)
|
||||
print(f"{metrics=}")
|
||||
self.assertGreater(metrics["accuracy"], 0.87)
|
||||
self.assertGreaterEqual(metrics["accuracy"], self.accuracy)
|
||||
|
||||
|
||||
class TestNvidiaNemotronNanoV2FP8(TestNvidiaNemotronNanoV2):
|
||||
accuracy = 0.87
|
||||
model = "nvidia/NVIDIA-Nemotron-Nano-9B-v2-FP8"
|
||||
|
||||
|
||||
@unittest.skipIf(not is_blackwell(), "NVFP4 only supported on blackwell")
|
||||
class TestNvidiaNemotronNanoV2NVFP4(TestNvidiaNemotronNanoV2):
|
||||
accuracy = 0.855
|
||||
model = "nvidia/NVIDIA-Nemotron-Nano-9B-v2-NVFP4"
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -19,6 +19,9 @@ suites = {
|
||||
TestFile("hicache/test_hicache_eagle.py", 150),
|
||||
TestFile("hicache/test_hicache_mla.py", 127),
|
||||
TestFile("hicache/test_hicache_storage.py", 127),
|
||||
TestFile("layers/attention/mamba/test_causal_conv1d.py", 25),
|
||||
TestFile("layers/attention/mamba/test_mamba_ssm.py", 50),
|
||||
TestFile("layers/attention/mamba/test_mamba_ssm_ssd.py", 70),
|
||||
TestFile("lora/test_lora.py", 200),
|
||||
TestFile("lora/test_lora_eviction.py", 200),
|
||||
TestFile("lora/test_lora_eviction_policy.py", 200),
|
||||
@@ -34,7 +37,7 @@ suites = {
|
||||
TestFile("models/test_embedding_models.py", 73),
|
||||
TestFile("models/test_encoder_embedding_models.py", 460),
|
||||
TestFile("models/test_generation_models.py", 103),
|
||||
TestFile("models/test_nvidia_nemotron_nano_v2.py", 180),
|
||||
TestFile("models/test_nvidia_nemotron_nano_v2.py", 300),
|
||||
TestFile("models/test_qwen_models.py", 82),
|
||||
TestFile("batch_invariant/test_batch_invariant_ops.py", 10),
|
||||
TestFile("models/test_reward_models.py", 132),
|
||||
@@ -143,7 +146,7 @@ suites = {
|
||||
TestFile("hicache/test_hicache_storage_3fs_backend.py", 200),
|
||||
TestFile("hicache/test_hicache_storage_file_backend.py", 200),
|
||||
TestFile("hicache/test_hicache_storage_mooncake_backend.py", 400),
|
||||
TestFile("layers/attention/mamba/test_mamba2_mixer.py", 110),
|
||||
TestFile("layers/attention/mamba/test_mamba2_mixer.py", 50),
|
||||
TestFile("lora/test_lora_tp.py", 116),
|
||||
TestFile("models/test_glm4_moe_models.py", 100),
|
||||
TestFile("rl/test_update_weights_from_distributed.py", 103),
|
||||
|
||||
Reference in New Issue
Block a user