Qwen3-Next support (#10233)

Co-authored-by: cao1zhg <114661107+cao1zhg@users.noreply.github.com> Co-authored-by: ispobock <ispobaoke@gmail.com> Co-authored-by: Binyao Jiang <byjiang1996@gmail.com> Co-authored-by: hebiao064 <hebiaobuaa@gmail.com> Co-authored-by: Lifu Huang <lifu.hlf@gmail.com> Co-authored-by: qingquansong <ustcsqq@gmail.com> Co-authored-by: Yaoyao Ding <dingyaoyao.cs@gmail.com> Co-authored-by: Ke Bao <ISPObaoke@163.com> Co-authored-by: Minglei Zhu <mingleizhu1122@gmail.com>
2025-09-11 19:11:49 +08:00
parent bfe01a5eef
commit 30c6e1f569
19 changed files with 3224 additions and 8 deletions
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -95,6 +95,7 @@ ATTENTION_BACKEND_CHOICES = [
    "trtllm_mla",
    "trtllm_mha",
    "dual_chunk_flash_attn",
+    "hybrid_linear_attn",
    # AMD specific
    "aiter",
    "wave",
@@ -390,6 +391,10 @@ class ServerArgs:
    enable_pdmux: bool = False
    sm_group_num: int = 3

+    # Mamba cache
+    max_mamba_cache_size: Optional[int] = None
+    mamba_ssm_dtype: str = "float32"
+
    # Deprecated arguments
    enable_ep_moe: bool = False
    enable_deepep_moe: bool = False
@@ -835,6 +840,8 @@ class ServerArgs:
        os.environ["SGLANG_ENABLE_TORCH_COMPILE"] = (
            "1" if self.enable_torch_compile else "0"
        )
+        os.environ["SGLANG_MAMBA_SSM_DTYPE"] = self.mamba_ssm_dtype
+
        # Set env var before grammar backends init
        os.environ["SGLANG_DISABLE_OUTLINES_DISK_CACHE"] = (
            "1" if self.disable_outlines_disk_cache else "0"
@@ -1714,7 +1721,20 @@ class ServerArgs:
            default=ServerArgs.moe_dense_tp_size,
            help="TP size for MoE dense MLP layers. This flag is useful when, with large TP size, there are errors caused by weights in MLP layers having dimension smaller than the min dimension GEMM supports.",
        )
-
+        # Mamba cache
+        parser.add_argument(
+            "--max-mamba-cache-size",
+            type=int,
+            default=ServerArgs.max_mamba_cache_size,
+            help="It is used for mamba cache memory static allocation.",
+        )
+        parser.add_argument(
+            "--mamba-ssm-dtype",
+            type=str,
+            default=ServerArgs.mamba_ssm_dtype,
+            choices=["float32", "bfloat16"],
+            help="It is used to tune mamba ssm dtype",
+        )
        # Hierarchical cache
        parser.add_argument(
            "--enable-hierarchical-cache",