Clean up some Qwen3-Next and deterministic code (#11585)

2025-10-15 00:19:37 -07:00
parent 6bc503af73
commit 6b143d62a2
4 changed files with 1 additions and 15 deletions
--- a/python/sglang/srt/configs/mamba_utils.py
+++ b/python/sglang/srt/configs/mamba_utils.py
@@ -70,7 +70,7 @@ class Mamba2StateShape:
        # These are not TP-ed as they depend on A, dt_bias, D
        # - they are typically small
-        #   e.g., (h_heads, head_dim, state_size) = (128, 64, 128)
+        #   e.g., Qwen3-Next: (32, 128, 128)
        temporal_state_shape = (divide(num_heads, tp_world_size), head_dim, state_size)
        return Mamba2StateShape(
            conv=conv_state_shape,
--- a/python/sglang/srt/configs/qwen3_next.py
+++ b/python/sglang/srt/configs/qwen3_next.py
@@ -27,12 +27,9 @@ from sglang.srt.layers.dp_attention import get_attention_tp_size
 logger = logging.get_logger(__name__)
 # NOTE: HybridLayerType
 class HybridLayerType(enum.Enum):
    """Closed set of layer-type tags, each backed by a string value.

    NOTE(review): presumably labels the kind of each layer in a hybrid
    model config -- confirm against the call sites.
    """

    # The value is "attention", not "full_attention"; lookups by value
    # must use the short form.
    full_attention = "attention"
    swa_attention = "swa_attention"
    linear_attention = "linear_attention"
    # The value is "mamba", not "mamba2".
    mamba2 = "mamba"
 class Qwen3NextConfig(PretrainedConfig):
--- a/python/sglang/srt/models/falcon_h1.py
+++ b/python/sglang/srt/models/falcon_h1.py
@@ -450,13 +450,6 @@ class FalconH1Model(nn.Module):
        return hidden_states
 class HybridLayerType(enum.Enum):
    """String-valued enumeration of layer-type tags.

    NOTE(review): the same four members and values also appear in
    sglang/srt/configs/qwen3_next.py; presumably one definition should be
    imported rather than duplicated -- confirm.
    """

    # Member name and value differ here: the value is "attention".
    full_attention = "attention"
    swa_attention = "swa_attention"
    linear_attention = "linear_attention"
    # Member name and value differ here: the value is "mamba".
    mamba2 = "mamba"
 class FalconH1ForCausalLM(nn.Module):
    fall_back_to_pt_during_load = False
--- a/python/sglang/test/test_deterministic.py
+++ b/python/sglang/test/test_deterministic.py
@@ -226,10 +226,6 @@ def send_prefix(args, batch_size: int, prompts: List[str]):
 def test_deterministic(args):
    # First do some warmups
    for i in range(3):
        send_single(args, 16, args.profile)
    if args.test_mode == "single":
        # In single mode, we test the deterministic behavior by sending the same prompt in batch sizes ranging from 1 to n_trials.
        texts = []