Qwen3-Next support (#10233)
Co-authored-by: cao1zhg <114661107+cao1zhg@users.noreply.github.com>
Co-authored-by: ispobock <ispobaoke@gmail.com>
Co-authored-by: Binyao Jiang <byjiang1996@gmail.com>
Co-authored-by: hebiao064 <hebiaobuaa@gmail.com>
Co-authored-by: Lifu Huang <lifu.hlf@gmail.com>
Co-authored-by: qingquansong <ustcsqq@gmail.com>
Co-authored-by: Yaoyao Ding <dingyaoyao.cs@gmail.com>
Co-authored-by: Ke Bao <ISPObaoke@163.com>
Co-authored-by: Minglei Zhu <mingleizhu1122@gmail.com>
This commit is contained in:
@@ -95,6 +95,7 @@ ATTENTION_BACKEND_CHOICES = [
|
||||
"trtllm_mla",
|
||||
"trtllm_mha",
|
||||
"dual_chunk_flash_attn",
|
||||
"hybrid_linear_attn",
|
||||
# AMD specific
|
||||
"aiter",
|
||||
"wave",
|
||||
@@ -390,6 +391,10 @@ class ServerArgs:
|
||||
enable_pdmux: bool = False
|
||||
sm_group_num: int = 3
|
||||
|
||||
# Mamba cache
|
||||
max_mamba_cache_size: Optional[int] = None
|
||||
mamba_ssm_dtype: str = "float32"
|
||||
|
||||
# Deprecated arguments
|
||||
enable_ep_moe: bool = False
|
||||
enable_deepep_moe: bool = False
|
||||
@@ -835,6 +840,8 @@ class ServerArgs:
|
||||
os.environ["SGLANG_ENABLE_TORCH_COMPILE"] = (
|
||||
"1" if self.enable_torch_compile else "0"
|
||||
)
|
||||
os.environ["SGLANG_MAMBA_SSM_DTYPE"] = self.mamba_ssm_dtype
|
||||
|
||||
# Set env var before grammar backends init
|
||||
os.environ["SGLANG_DISABLE_OUTLINES_DISK_CACHE"] = (
|
||||
"1" if self.disable_outlines_disk_cache else "0"
|
||||
@@ -1714,7 +1721,20 @@ class ServerArgs:
|
||||
default=ServerArgs.moe_dense_tp_size,
|
||||
help="TP size for MoE dense MLP layers. This flag is useful when, with large TP size, there are errors caused by weights in MLP layers having dimension smaller than the min dimension GEMM supports.",
|
||||
)
|
||||
|
||||
# Mamba cache
|
||||
parser.add_argument(
|
||||
"--max-mamba-cache-size",
|
||||
type=int,
|
||||
default=ServerArgs.max_mamba_cache_size,
|
||||
help="It is used for mamba cache memory static allocation.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--mamba-ssm-dtype",
|
||||
type=str,
|
||||
default=ServerArgs.mamba_ssm_dtype,
|
||||
choices=["float32", "bfloat16"],
|
||||
help="It is used to tune mamba ssm dtype",
|
||||
)
|
||||
# Hierarchical cache
|
||||
parser.add_argument(
|
||||
"--enable-hierarchical-cache",
|
||||
|
||||
Reference in New Issue
Block a user