Qwen3-Next support (#10233)

Co-authored-by: cao1zhg <114661107+cao1zhg@users.noreply.github.com>
Co-authored-by: ispobock <ispobaoke@gmail.com>
Co-authored-by: Binyao Jiang <byjiang1996@gmail.com>
Co-authored-by: hebiao064 <hebiaobuaa@gmail.com>
Co-authored-by: Lifu Huang <lifu.hlf@gmail.com>
Co-authored-by: qingquansong <ustcsqq@gmail.com>
Co-authored-by: Yaoyao Ding <dingyaoyao.cs@gmail.com>
Co-authored-by: Ke Bao <ISPObaoke@163.com>
Co-authored-by: Minglei Zhu <mingleizhu1122@gmail.com>
This commit is contained in:
Yi Zhang
2025-09-11 19:11:49 +08:00
committed by GitHub
parent bfe01a5eef
commit 30c6e1f569
19 changed files with 3224 additions and 8 deletions

View File

@@ -95,6 +95,7 @@ ATTENTION_BACKEND_CHOICES = [
"trtllm_mla",
"trtllm_mha",
"dual_chunk_flash_attn",
"hybrid_linear_attn",
# AMD specific
"aiter",
"wave",
@@ -390,6 +391,10 @@ class ServerArgs:
enable_pdmux: bool = False
sm_group_num: int = 3
# Mamba cache
# Size used for static allocation of the mamba cache memory
# (CLI: --max-mamba-cache-size). None means no value was supplied.
# NOTE(review): how the size is derived when this is None is not visible here.
max_mamba_cache_size: Optional[int] = None
# Name of the mamba SSM dtype (CLI: --mamba-ssm-dtype, restricted to
# "float32" or "bfloat16"); also exported as SGLANG_MAMBA_SSM_DTYPE.
mamba_ssm_dtype: str = "float32"
# Deprecated arguments
enable_ep_moe: bool = False
enable_deepep_moe: bool = False
@@ -835,6 +840,8 @@ class ServerArgs:
# Mirror enable_torch_compile into the environment as "1" or "0".
os.environ["SGLANG_ENABLE_TORCH_COMPILE"] = (
"1" if self.enable_torch_compile else "0"
)
# Publish the selected mamba SSM dtype name through the environment.
# NOTE(review): the reader of SGLANG_MAMBA_SSM_DTYPE is not visible here —
# confirm it is consulted only after this assignment has run.
os.environ["SGLANG_MAMBA_SSM_DTYPE"] = self.mamba_ssm_dtype
# Set env var before grammar backends init
os.environ["SGLANG_DISABLE_OUTLINES_DISK_CACHE"] = (
"1" if self.disable_outlines_disk_cache else "0"
@@ -1714,7 +1721,20 @@ class ServerArgs:
default=ServerArgs.moe_dense_tp_size,
help="TP size for MoE dense MLP layers. This flag is useful when, with large TP size, there are errors caused by weights in MLP layers having dimension smaller than the min dimension GEMM supports.",
)
# Mamba cache
# Integer size for the statically allocated mamba cache; the default comes
# from ServerArgs.max_mamba_cache_size (None when the flag is omitted).
parser.add_argument(
"--max-mamba-cache-size",
type=int,
default=ServerArgs.max_mamba_cache_size,
help="It is used for mamba cache memory static allocation.",
)
# Dtype name for the mamba SSM. argparse rejects anything outside `choices`,
# so only "float32" (the ServerArgs default) or "bfloat16" can be passed here.
parser.add_argument(
"--mamba-ssm-dtype",
type=str,
default=ServerArgs.mamba_ssm_dtype,
choices=["float32", "bfloat16"],
help="It is used to tune mamba ssm dtype",
)
# Hierarchical cache
parser.add_argument(
"--enable-hierarchical-cache",