From 6c18addb6f530c1d1c7b53b586e89294a6391712 Mon Sep 17 00:00:00 2001 From: Liangsheng Yin Date: Thu, 23 Oct 2025 21:27:58 +0800 Subject: [PATCH] Revert "Support nvidia/NVIDIA-Nemotron-Nano-9B-v2-FP8/NVFP4" (#12015) --- .../srt/layers/quantization/modelopt_quant.py | 133 ++++++++---------- python/sglang/srt/model_loader/loader.py | 6 +- .../sglang/srt/model_loader/weight_utils.py | 71 ++++------ python/sglang/srt/models/nemotron_h.py | 41 +++--- .../attention/mamba/test_causal_conv1d.py | 4 - .../attention/mamba/test_mamba2_mixer.py | 5 - .../layers/attention/mamba/test_mamba_ssm.py | 5 - .../attention/mamba/test_mamba_ssm_ssd.py | 43 ++---- .../models/test_nvidia_nemotron_nano_v2.py | 19 +-- test/srt/run_suite.py | 7 +- 10 files changed, 127 insertions(+), 207 deletions(-) diff --git a/python/sglang/srt/layers/quantization/modelopt_quant.py b/python/sglang/srt/layers/quantization/modelopt_quant.py index 634fb121f..949d63450 100755 --- a/python/sglang/srt/layers/quantization/modelopt_quant.py +++ b/python/sglang/srt/layers/quantization/modelopt_quant.py @@ -90,50 +90,7 @@ CUTEDSL_MOE_NVFP4_DISPATCH = get_bool_env_var( ACTIVATION_SCHEMES = ["static"] -class ModelOptQuantConfig(QuantizationConfig): - def __init__( - self, - kv_cache_quant_algo: Optional[str], - exclude_modules: Optional[List[str]], - packed_modules_mapping: Optional[Dict[str, List[str]]], - ): - super().__init__() - self.packed_modules_mapping = packed_modules_mapping - self.exclude_modules = exclude_modules or [] - self.kv_cache_quant_algo = kv_cache_quant_algo - - def _get_quant_method( - self, - layer: torch.nn.Module, - prefix: str, - *, - Linear: type[LinearMethodBase], - Moe: type[FusedMoEMethodBase], - ) -> Optional[QuantizeMethodBase]: - from sglang.srt.layers.linear import LinearBase - from sglang.srt.layers.moe.fused_moe_triton import FusedMoE - - if isinstance(layer, LinearBase): - if is_layer_skipped( - prefix, self.exclude_modules, self.packed_modules_mapping - ) or self.is_layer_excluded(prefix): - return UnquantizedLinearMethod() - return Linear(self) - elif self.kv_cache_quant_algo and isinstance(layer, RadixAttention): - return ModelOptFp8KVCacheMethod(self) - elif isinstance(layer, FusedMoE): - return Moe(self) - return None - - @classmethod - def get_config_filenames(cls) -> List[str]: - return ["hf_quant_config.json"] - - def get_scaled_act_names(self) -> List[str]: - return [] - - -class ModelOptFp8Config(ModelOptQuantConfig): +class ModelOptFp8Config(QuantizationConfig): """Configuration for ModelOpt FP8 quantization, including serialization and compatibility checks.""" def __init__( @@ -141,14 +98,14 @@ class ModelOptFp8Config(ModelOptQuantConfig): is_checkpoint_fp8_serialized: bool = False, kv_cache_quant_method: Optional[str] = None, exclude_modules: Optional[List[str]] = None, - packed_modules_mapping: Optional[Dict[str, List[str]]] = None, ) -> None: """ Args: is_checkpoint_fp8_serialized (bool): Indicates if the checkpoint uses serialized FP8 format. """ - super().__init__(kv_cache_quant_method, exclude_modules, packed_modules_mapping) self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized + self.kv_cache_quant_method = kv_cache_quant_method + self.exclude_modules = exclude_modules if is_checkpoint_fp8_serialized: logger.warning( "Detected ModelOpt FP8 checkpoint. The format is experimental and subject to change." @@ -171,6 +128,10 @@ class ModelOptFp8Config(ModelOptQuantConfig): def get_min_capability(cls) -> int: return 89 # Minimum hardware capability (e.g., Hopper GPUs). 
+ @classmethod + def get_config_filenames(cls) -> List[str]: + return ["hf_quant_config.json"] + @classmethod def from_config(cls, config: Dict[str, Any]) -> ModelOptFp8Config: # Handle two different config formats: @@ -225,27 +186,37 @@ class ModelOptFp8Config(ModelOptQuantConfig): is_checkpoint_fp8_serialized=True, kv_cache_quant_method=kv_cache_quant_method, exclude_modules=exclude_modules, - packed_modules_mapping=config.get("packed_modules_mapping"), ) - def is_layer_excluded(self, prefix: str) -> bool: - if len(self.exclude_modules) == 0: - return False - return any( + def get_quant_method( + self, layer: torch.nn.Module, prefix: str + ) -> Optional[QuantizeMethodBase]: + + from sglang.srt.layers.linear import LinearBase + from sglang.srt.layers.moe.fused_moe_triton import FusedMoE + + if self.exclude_modules and any( module in prefix or ( prefix.startswith("language_model.") and module in prefix.removeprefix("language_model.") ) for module in self.exclude_modules - ) + ): + return None - def get_quant_method( - self, layer: torch.nn.Module, prefix: str - ) -> Optional[QuantizeMethodBase]: - return self._get_quant_method( - layer, prefix, Linear=ModelOptFp8LinearMethod, Moe=ModelOptFp8MoEMethod - ) + if isinstance(layer, LinearBase): + return ModelOptFp8LinearMethod(self) + if self.kv_cache_quant_method and isinstance(layer, RadixAttention): + return ModelOptFp8KVCacheMethod(self) + + if isinstance(layer, FusedMoE): + return ModelOptFp8MoEMethod(self) + + return None + + def get_scaled_act_names(self) -> List[str]: + return [] class ModelOptFp8LinearMethod(LinearMethodBase): @@ -541,7 +512,7 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase): return self.runner.run(dispatch_output, quant_info) -class ModelOptFp4Config(ModelOptQuantConfig): +class ModelOptFp4Config(QuantizationConfig): """Config class for FP4.""" def __init__( @@ -550,9 +521,7 @@ class ModelOptFp4Config(ModelOptQuantConfig): kv_cache_quant_algo: str = None, group_size: int = None, exclude_modules: List[str] = None, - packed_modules_mapping: Optional[Dict[str, List[str]]] = None, ) -> None: - super().__init__(kv_cache_quant_algo, exclude_modules, packed_modules_mapping) self.is_checkpoint_nvfp4_serialized = is_checkpoint_nvfp4_serialized if is_checkpoint_nvfp4_serialized: logger.warning( @@ -560,6 +529,8 @@ class ModelOptFp4Config(ModelOptQuantConfig): "format is experimental and subject to change." 
) self.group_size = group_size + self.kv_cache_quant_algo = kv_cache_quant_algo + self.exclude_modules = exclude_modules @classmethod def override_quantization_method(cls, hf_quant_config, user_quant): @@ -578,6 +549,10 @@ class ModelOptFp4Config(ModelOptQuantConfig): def get_min_capability(cls) -> int: return 100 + @classmethod + def get_config_filenames(cls) -> List[str]: + return ["hf_quant_config.json"] + @staticmethod def common_group_size(cfg: dict) -> int: """Return the unique group_size across the config; raise if missing/mismatched.""" @@ -693,15 +668,14 @@ class ModelOptFp4Config(ModelOptQuantConfig): kv_cache_quant_algo, group_size, exclude_modules, - config.get("packed_modules_mapping"), ) - def is_layer_excluded(self, prefix: str): + def is_layer_excluded(self, prefix: str, exclude_modules: list): import regex as re fused_patterns = ["q_a_proj", "q_b_proj", "kv_a_proj_with_mqa", "kv_b_proj"] prefix_split = prefix.split(".") - for pattern in self.exclude_modules: + for pattern in exclude_modules: regex_str = pattern.replace(".", r"\.").replace("*", r".*") pattern_split = pattern.split(".") if re.fullmatch(regex_str, prefix): @@ -717,17 +691,30 @@ class ModelOptFp4Config(ModelOptQuantConfig): return True return False - def get_quant_method(self, layer: torch.nn.Module, prefix: str): + def get_quant_method( + self, layer: torch.nn.Module, prefix: str + ) -> Optional[QuantizeMethodBase]: + from sglang.srt.layers.linear import LinearBase + from sglang.srt.layers.moe.fused_moe_triton import FusedMoE from sglang.srt.layers.moe.fused_moe_triton.layer import FlashInferFP4MoE - Moe = ( - FlashInferFP4MoE # FlashInferFP4MoE needs the same quantization method but with compatible attribute handling - if isinstance(layer, FlashInferFP4MoE) - else ModelOptNvFp4FusedMoEMethod - ) - return self._get_quant_method( - layer, prefix, Linear=ModelOptFp4LinearMethod, Moe=Moe - ) + if isinstance(layer, LinearBase): + if is_layer_skipped(prefix, self.exclude_modules) or self.is_layer_excluded( + prefix, self.exclude_modules + ): + return UnquantizedLinearMethod() + return ModelOptFp4LinearMethod(self) + if self.kv_cache_quant_algo and isinstance(layer, RadixAttention): + return ModelOptFp8KVCacheMethod(self) + elif isinstance(layer, FlashInferFP4MoE): + # FlashInferFP4MoE needs the same quantization method but with compatible attribute handling + return ModelOptNvFp4FusedMoEMethod(self) + elif isinstance(layer, FusedMoE): + return ModelOptNvFp4FusedMoEMethod(self) + return None + + def get_scaled_act_names(self) -> List[str]: + return [] class ModelOptFp4LinearMethod(LinearMethodBase): diff --git a/python/sglang/srt/model_loader/loader.py b/python/sglang/srt/model_loader/loader.py index 06ecb5041..6134f24ba 100644 --- a/python/sglang/srt/model_loader/loader.py +++ b/python/sglang/srt/model_loader/loader.py @@ -180,12 +180,11 @@ def _get_quantization_config( model_config: ModelConfig, load_config: LoadConfig, packed_modules_mapping: Dict[str, List[str]], - remap_prefix: Dict[str, str] | None = None, ) -> Optional[QuantizationConfig]: """Get the quantization config.""" if model_config.quantization is not None: quant_config = get_quant_config( - model_config, load_config, packed_modules_mapping, remap_prefix + model_config, load_config, packed_modules_mapping ) # (yizhang2077) workaround for nvidia/Llama-4-Maverick-17B-128E-Eagle3 if quant_config is None: @@ -221,7 +220,6 @@ def _initialize_model( """Initialize a model with the given configurations.""" model_class, _ = 
get_model_architecture(model_config) packed_modules_mapping = getattr(model_class, "packed_modules_mapping", {}) - remap_prefix = getattr(model_class, "remap_prefix", None) if _is_npu: packed_modules_mapping.update( { @@ -245,7 +243,7 @@ def _initialize_model( ) quant_config = _get_quantization_config( - model_config, load_config, packed_modules_mapping, remap_prefix + model_config, load_config, packed_modules_mapping ) # Build kwargs conditionally diff --git a/python/sglang/srt/model_loader/weight_utils.py b/python/sglang/srt/model_loader/weight_utils.py index 7edd0bbe0..d4585bbb3 100644 --- a/python/sglang/srt/model_loader/weight_utils.py +++ b/python/sglang/srt/model_loader/weight_utils.py @@ -37,10 +37,7 @@ from sglang.srt.configs.model_config import ModelConfig from sglang.srt.distributed import get_tensor_model_parallel_rank from sglang.srt.layers.dp_attention import get_attention_tp_rank from sglang.srt.layers.quantization import QuantizationConfig, get_quantization_config -from sglang.srt.layers.quantization.modelopt_quant import ( - ModelOptFp4Config, - ModelOptFp8Config, -) +from sglang.srt.layers.quantization.modelopt_quant import ModelOptFp4Config from sglang.srt.utils import find_local_repo_dir, log_info_on_rank0, print_warning_once from sglang.utils import is_in_ci @@ -138,26 +135,11 @@ def convert_bin_to_safetensor_file( raise RuntimeError(f"The output tensors do not match for key {k}") -def replace_prefix(key: str, prefix_mapping: dict[str, str]) -> str: - for prefix, new_prefix in prefix_mapping.items(): - if key.startswith(prefix): - key = key.replace(prefix, new_prefix, 1) - return key - - -def replace_substrings(key: str, substring_mapping: dict[str, str]) -> str: - for substr, new_substr in substring_mapping.items(): - if substr in key: - key = key.replace(substr, new_substr) - return key - - # TODO(woosuk): Move this to other place. def get_quant_config( model_config: ModelConfig, load_config: LoadConfig, packed_modules_mapping: Dict[str, List[str]], - remap_prefix: Dict[str, str] | None = None, ) -> QuantizationConfig: quant_cls = get_quantization_config(model_config.quantization) @@ -227,33 +209,38 @@ def get_quant_config( quant_config_file = quant_config_files[0] with open(quant_config_file) as f: config = json.load(f) - if remap_prefix is not None: - exclude_modules = [ - replace_prefix(key, remap_prefix) - for key in config["quantization"]["exclude_modules"] - ] - config["quantization"]["exclude_modules"] = exclude_modules - config["packed_modules_mapping"] = packed_modules_mapping if model_config.quantization == "bitsandbytes": config["adapter_name_or_path"] = model_name_or_path - elif model_config.quantization.startswith("modelopt") and ( - config["producer"]["name"].startswith("modelopt") - ): - quant_algo = config["quantization"]["quant_algo"] - if quant_algo is None: + elif model_config.quantization == "modelopt": + if config["producer"]["name"] == "modelopt": # (yizhang2077) workaround for nvidia/Llama-4-Maverick-17B-128E-Eagle3 - if model_config.hf_config.architectures[0] != "LlamaForCausalLMEagle3": - raise ValueError( - f"Invalid quant_config, quantization method: {model_config.quantization}," - f"hf architectures: {model_config.hf_config.architectures[0]}. 
" - ) - return None - elif quant_algo == "FP8" or model_config.quantization == "modelopt_fp8": - return ModelOptFp8Config.from_config(config) - elif "FP4" in quant_algo: - return ModelOptFp4Config.from_config(config) - return quant_cls.from_config(config) + if config["quantization"]["quant_algo"] is None: + if ( + model_config.hf_config.architectures[0] + != "LlamaForCausalLMEagle3" + ): + raise ValueError( + f"Invalid quant_config, quantization method: {model_config.quantization}," + f"hf architectures: {model_config.hf_config.architectures[0]}. " + ) + return None + if "FP4" in config["quantization"]["quant_algo"]: + return ModelOptFp4Config.from_config(config) + else: + return quant_cls.from_config(config) + elif model_config.quantization == "modelopt_fp8": + if config["producer"]["name"] == "modelopt_fp8": + return quant_cls.from_config(config) + else: + raise ValueError( + f"Unsupported quantization config" + f" found for {model_config.quantization} in {f}." + ) + elif model_config.quantization == "w8a8_int8": + config["packed_modules_mapping"] = packed_modules_mapping + + return quant_cls.from_config(config) def find_local_hf_snapshot_dir( diff --git a/python/sglang/srt/models/nemotron_h.py b/python/sglang/srt/models/nemotron_h.py index eadff130f..9f0126c3f 100644 --- a/python/sglang/srt/models/nemotron_h.py +++ b/python/sglang/srt/models/nemotron_h.py @@ -48,8 +48,6 @@ from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTe from sglang.srt.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name, - replace_prefix, - replace_substrings, ) from sglang.srt.utils import add_prefix, make_layers_non_pp from sglang.utils import logger @@ -157,7 +155,6 @@ class NemotronHMambaDecoderLayer(nn.Module): rms_norm_eps=config.rms_norm_eps, activation=config.mamba_hidden_act, quant_config=quant_config, - prefix=f"{prefix}.mixer", ) self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -384,19 +381,16 @@ class NemotronHModel(nn.Module): class NemotronHForCausalLM(nn.Module): - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ] - packed_modules_mapping = { - "qkv_proj": ["q_proj", "k_proj", "v_proj"], - } - remap_prefix = {"backbone": "model"} remap_substr = {"A_log": "A", "embeddings": "embed_tokens"} + # LoRA specific attributes + embedding_modules = { + "embed_tokens": "input_embeddings", + "lm_head": "output_embeddings", + } + embedding_padding_modules = ["lm_head"] + def __init__( self, *, @@ -438,9 +432,7 @@ class NemotronHForCausalLM(nn.Module): quant_config: Optional[QuantizationConfig] = None, prefix: str = "", ): - return NemotronHModel( - config=config, quant_config=quant_config, prefix=add_prefix("model", prefix) - ) + return NemotronHModel(config=config, quant_config=quant_config, prefix=prefix) def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.model.get_input_embeddings(input_ids) @@ -468,10 +460,21 @@ class NemotronHForCausalLM(nn.Module): return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> None: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ] + updated_weights = [] for name, loaded_weight in weights: - name = replace_prefix(name, self.remap_prefix) - name = 
replace_substrings(name, self.remap_substr) + for prefix, new_key in self.remap_prefix.items(): + if name.startswith(prefix): + name = name.replace(prefix, new_key) + for substr, new_key in self.remap_substr.items(): + if substr in name: + name = name.replace(substr, new_key) updated_weights.append((name, loaded_weight)) params_dict = dict(self.named_parameters()) @@ -481,7 +484,7 @@ class NemotronHForCausalLM(nn.Module): if name is None: continue - for param_name, weight_name, shard_id in self.stacked_params_mapping: + for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue name = name.replace(weight_name, param_name) diff --git a/test/srt/layers/attention/mamba/test_causal_conv1d.py b/test/srt/layers/attention/mamba/test_causal_conv1d.py index dd1a9a25f..c56b96b4f 100644 --- a/test/srt/layers/attention/mamba/test_causal_conv1d.py +++ b/test/srt/layers/attention/mamba/test_causal_conv1d.py @@ -373,7 +373,3 @@ def test_causal_conv1d_varlen( ) unpadded_out = out[:, : out_ref_tensor.shape[-1]] assert torch.allclose(unpadded_out, out_ref_tensor, rtol=rtol, atol=atol) - - -if __name__ == "__main__": - pytest.main([__file__]) diff --git a/test/srt/layers/attention/mamba/test_mamba2_mixer.py b/test/srt/layers/attention/mamba/test_mamba2_mixer.py index 2252db653..aae477db5 100644 --- a/test/srt/layers/attention/mamba/test_mamba2_mixer.py +++ b/test/srt/layers/attention/mamba/test_mamba2_mixer.py @@ -1,6 +1,5 @@ # Adapted from https://github.com/vllm-project/vllm/blob/2c58742dff8613a3bd7496f2008ce927e18d38d1/tests/kernels/mamba/test_mamba_mixer2.py - from unittest.mock import patch import pytest @@ -137,7 +136,3 @@ def mixer2_gated_norm_tensor_parallel( atol=5e-3, rtol=1e-3, ) - - -if __name__ == "__main__": - pytest.main([__file__]) diff --git a/test/srt/layers/attention/mamba/test_mamba_ssm.py b/test/srt/layers/attention/mamba/test_mamba_ssm.py index 4a2c9a8e2..3e983a00e 100644 --- a/test/srt/layers/attention/mamba/test_mamba_ssm.py +++ b/test/srt/layers/attention/mamba/test_mamba_ssm.py @@ -1,6 +1,5 @@ # Adapted from https://github.com/vllm-project/vllm/blob/633f943e30a4444d890d26b81850f7217736f840/tests/kernels/mamba/test_mamba_ssm_ssd.py - import pytest import torch import torch.nn.functional as F @@ -290,7 +289,3 @@ def test_selective_state_update_with_heads_with_batch_indices( print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") assert torch.allclose(state[state_indices, :], state_ref, rtol=rtol, atol=atol) assert torch.allclose(out, out_ref, rtol=rtol, atol=atol) - - -if __name__ == "__main__": - pytest.main([__file__]) diff --git a/test/srt/layers/attention/mamba/test_mamba_ssm_ssd.py b/test/srt/layers/attention/mamba/test_mamba_ssm_ssd.py index 10a7f3f80..493a179ee 100644 --- a/test/srt/layers/attention/mamba/test_mamba_ssm_ssd.py +++ b/test/srt/layers/attention/mamba/test_mamba_ssm_ssd.py @@ -8,12 +8,13 @@ from einops import rearrange, repeat from sglang.srt.layers.attention.mamba.mamba2_metadata import Mamba2Metadata from sglang.srt.layers.attention.mamba.ops import mamba_chunk_scan_combined -from sglang.utils import is_in_ci # Added by the IBM Team, 2024 # Adapted from https://github.com/state-spaces/mamba/blob/v2.2.4/mamba_ssm/modules/ssd_minimal.py +# TODO: These take a long time to run - we should cut down on some of the parameterized matrix. 
+ # this is the segsum implementation taken from above def segsum(x): @@ -190,22 +191,10 @@ def generate_continuous_batched_examples( ) -SINGLE_ITYPE = [torch.float32, torch.float16, torch.bfloat16] -SINGLE_NHEADS = [3, 4, 11, 16, 32] -SINGLE_DHEAD = [5, 8, 19, 32, 128] -SINGLE_SEQ_LEN_CHUNK_SIZE = [(112, 16), (128, 32)] - -if is_in_ci(): - SINGLE_ITYPE = [torch.float32, torch.bfloat16] - SINGLE_NHEADS = [3, 32] - SINGLE_DHEAD = [5, 128] - SINGLE_SEQ_LEN_CHUNK_SIZE = [(112, 16)] - - -@pytest.mark.parametrize("itype", SINGLE_ITYPE) -@pytest.mark.parametrize("n_heads", SINGLE_NHEADS) -@pytest.mark.parametrize("d_head", SINGLE_DHEAD) -@pytest.mark.parametrize("seq_len_chunk_size", SINGLE_SEQ_LEN_CHUNK_SIZE) +@pytest.mark.parametrize("itype", [torch.float32, torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("n_heads", [3, 4, 11, 16, 32]) +@pytest.mark.parametrize("d_head", [5, 8, 19, 32, 128]) +@pytest.mark.parametrize("seq_len_chunk_size", [(112, 16), (128, 32)]) def test_mamba_chunk_scan_single_example(d_head, n_heads, seq_len_chunk_size, itype): if not torch.cuda.is_available(): pytest.skip("CUDA device not available") @@ -249,19 +238,9 @@ def test_mamba_chunk_scan_single_example(d_head, n_heads, seq_len_chunk_size, it ) -BATCHED_ITYPE = [torch.float32, torch.float16] -BATCHED_NHEADS = [4, 8, 13] -BATCHED_DHEAD = [5, 16, 21, 32] - -if is_in_ci(): - BATCHED_ITYPE = [torch.float32] - BATCHED_NHEADS = [4, 13] - BATCHED_DHEAD = [5, 32] - - -@pytest.mark.parametrize("itype", BATCHED_ITYPE) -@pytest.mark.parametrize("n_heads", BATCHED_NHEADS) -@pytest.mark.parametrize("d_head", BATCHED_DHEAD) +@pytest.mark.parametrize("itype", [torch.float32, torch.float16]) +@pytest.mark.parametrize("n_heads", [4, 8, 13]) +@pytest.mark.parametrize("d_head", [5, 16, 21, 32]) @pytest.mark.parametrize( "seq_len_chunk_size_cases", [ @@ -600,7 +579,3 @@ def test_mamba_chunk_scan_cont_batch_prefill_chunking(chunk_size, seqlens): rtol=rtol, msg=lambda x: f"seq{i} state " + x, ) # noqa: B023 - - -if __name__ == "__main__": - pytest.main([__file__]) diff --git a/test/srt/models/test_nvidia_nemotron_nano_v2.py b/test/srt/models/test_nvidia_nemotron_nano_v2.py index 4b414fbac..2fcb6fea0 100644 --- a/test/srt/models/test_nvidia_nemotron_nano_v2.py +++ b/test/srt/models/test_nvidia_nemotron_nano_v2.py @@ -1,7 +1,7 @@ import unittest from types import SimpleNamespace -from sglang.srt.utils import is_blackwell, kill_process_tree +from sglang.srt.utils import kill_process_tree from sglang.test.few_shot_gsm8k import run_eval from sglang.test.test_utils import ( DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, @@ -12,11 +12,9 @@ from sglang.test.test_utils import ( class TestNvidiaNemotronNanoV2(CustomTestCase): - model = "nvidia/NVIDIA-Nemotron-Nano-9B-v2" - accuracy = 0.87 - @classmethod def setUpClass(cls): + cls.model = "nvidia/NVIDIA-Nemotron-Nano-9B-v2" cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, @@ -44,18 +42,7 @@ class TestNvidiaNemotronNanoV2(CustomTestCase): ) metrics = run_eval(args) print(f"{metrics=}") - self.assertGreaterEqual(metrics["accuracy"], self.accuracy) - - -class TestNvidiaNemotronNanoV2FP8(TestNvidiaNemotronNanoV2): - accuracy = 0.87 - model = "nvidia/NVIDIA-Nemotron-Nano-9B-v2-FP8" - - -@unittest.skipIf(not is_blackwell(), "NVFP4 only supported on blackwell") -class TestNvidiaNemotronNanoV2NVFP4(TestNvidiaNemotronNanoV2): - accuracy = 0.855 - model = "nvidia/NVIDIA-Nemotron-Nano-9B-v2-NVFP4" + self.assertGreater(metrics["accuracy"], 0.87) if __name__ == "__main__": diff 
--git a/test/srt/run_suite.py b/test/srt/run_suite.py index 723073bf2..845e22ee6 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -19,9 +19,6 @@ suites = { TestFile("hicache/test_hicache_eagle.py", 150), TestFile("hicache/test_hicache_mla.py", 127), TestFile("hicache/test_hicache_storage.py", 127), - TestFile("layers/attention/mamba/test_causal_conv1d.py", 25), - TestFile("layers/attention/mamba/test_mamba_ssm.py", 50), - TestFile("layers/attention/mamba/test_mamba_ssm_ssd.py", 70), TestFile("lora/test_lora.py", 200), TestFile("lora/test_lora_eviction.py", 200), TestFile("lora/test_lora_eviction_policy.py", 200), @@ -37,7 +34,7 @@ suites = { TestFile("models/test_embedding_models.py", 73), TestFile("models/test_encoder_embedding_models.py", 460), TestFile("models/test_generation_models.py", 103), - TestFile("models/test_nvidia_nemotron_nano_v2.py", 300), + TestFile("models/test_nvidia_nemotron_nano_v2.py", 180), TestFile("models/test_qwen_models.py", 82), TestFile("batch_invariant/test_batch_invariant_ops.py", 10), TestFile("models/test_reward_models.py", 132), @@ -146,7 +143,7 @@ suites = { TestFile("hicache/test_hicache_storage_3fs_backend.py", 200), TestFile("hicache/test_hicache_storage_file_backend.py", 200), TestFile("hicache/test_hicache_storage_mooncake_backend.py", 400), - TestFile("layers/attention/mamba/test_mamba2_mixer.py", 50), + TestFile("layers/attention/mamba/test_mamba2_mixer.py", 110), TestFile("lora/test_lora_tp.py", 116), TestFile("models/test_glm4_moe_models.py", 100), TestFile("rl/test_update_weights_from_distributed.py", 103),
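-- 
Reviewer notes (appended after the diff; not part of the applied patch):

1. The revert folds the shared ModelOptQuantConfig._get_quant_method helper
back into per-config get_quant_method implementations. A minimal standalone
sketch of the exclusion check that ModelOptFp8Config.get_quant_method now
performs, with the matching rules copied from the restored code (the
is_excluded helper name is ours, for illustration only):

    from typing import List

    def is_excluded(prefix: str, exclude_modules: List[str]) -> bool:
        # A layer is skipped when any excluded module name appears in its
        # prefix, optionally after stripping a multimodal "language_model."
        # wrapper, mirroring the restored get_quant_method logic.
        return any(
            module in prefix
            or (
                prefix.startswith("language_model.")
                and module in prefix.removeprefix("language_model.")
            )
            for module in exclude_modules
        )

    assert is_excluded("language_model.lm_head", ["lm_head"])
    assert not is_excluded("model.layers.0.qkv_proj", ["lm_head"])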
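2. NemotronHForCausalLM.load_weights again applies its checkpoint-key
renaming inline rather than through the removed replace_prefix /
replace_substrings helpers. A hedged sketch of that renaming, with the
mappings taken from the patch and example keys of our own:

    remap_prefix = {"backbone": "model"}
    remap_substr = {"A_log": "A", "embeddings": "embed_tokens"}

    def remap_key(name: str) -> str:
        # First translate the leading "backbone" prefix, then substitute the
        # known substrings, as the restored load_weights loop does.
        for prefix, new_key in remap_prefix.items():
            if name.startswith(prefix):
                name = name.replace(prefix, new_key)
        for substr, new_key in remap_substr.items():
            if substr in name:
                name = name.replace(substr, new_key)
        return name

    assert remap_key("backbone.embeddings.weight") == "model.embed_tokens.weight"
    assert remap_key("backbone.layers.0.mixer.A_log") == "model.layers.0.mixer.A"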
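3. In test_mamba_ssm_ssd.py the revert drops the is_in_ci() gating that
shrank the pytest parameter matrices, restoring the full sweeps that the
re-added TODO comment complains about. For reference, the removed pattern
looked roughly like this (a generic sketch with a stand-in CI check, not
the exact deleted lists):

    import os

    import pytest
    import torch

    FULL_DTYPES = [torch.float32, torch.float16, torch.bfloat16]
    # Trim the dtype sweep when running in CI; os.environ.get("CI") is our
    # stand-in for the removed is_in_ci() helper.
    DTYPES = [torch.float32] if os.environ.get("CI") else FULL_DTYPES

    @pytest.mark.parametrize("itype", DTYPES)
    def test_dtype_sweep(itype):
        assert itype in FULL_DTYPES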