Support DeepSeek V3.2 Exp (#11061)

Co-authored-by: Stefan He <11166516+hebiao064@users.noreply.github.com>
Co-authored-by: Liangsheng Yin <95566987+hnyls2002@users.noreply.github.com>
Co-authored-by: Baizhou Zhang <56809903+fridge003@users.noreply.github.com>
Co-authored-by: DarkSharpness <76582120+darksharpness@users.noreply.github.com>
Co-authored-by: ZhengdQin <46387172+zhengdqin@users.noreply.github.com>
Co-authored-by: DarkSharpness <2040703891@qq.com>
Co-authored-by: hnyls2002 <lsyincs@gmail.com>
Co-authored-by: Zhengda Qin <zhengdqin@gmail.com>
Co-authored-by: Liangsheng Yin <hnyls2002@gmail.com>
Co-authored-by: HAI <hixiao@gmail.com>
Co-authored-by: Baizhou Zhang <sobereddiezhang@gmail.com>
This commit is contained in:
fzyzcjy
2025-10-06 15:24:15 +08:00
committed by GitHub
parent 292a867ad9
commit efbc687c28
29 changed files with 4540 additions and 139 deletions

View File

@@ -31,7 +31,12 @@ import torch.distributed as dist
from sglang.srt.configs.device_config import DeviceConfig
from sglang.srt.configs.load_config import LoadConfig, LoadFormat
from sglang.srt.configs.model_config import AttentionArch, ModelConfig
from sglang.srt.configs.model_config import (
AttentionArch,
ModelConfig,
get_nsa_index_head_dim,
is_deepseek_nsa,
)
from sglang.srt.configs.update_config import adjust_config_with_unaligned_cpu_tp
from sglang.srt.constants import GPU_MEMORY_TYPE_WEIGHTS
from sglang.srt.distributed import (
@@ -96,6 +101,7 @@ from sglang.srt.mem_cache.memory_pool import (
HybridReqToTokenPool,
MHATokenToKVPool,
MLATokenToKVPool,
NSATokenToKVPool,
ReqToTokenPool,
SWAKVPool,
)
@@ -157,6 +163,7 @@ MLA_ATTENTION_BACKENDS = [
"cutlass_mla",
"trtllm_mla",
"ascend",
"nsa",
]
@@ -1547,6 +1554,7 @@ class ModelRunner:
assert self.is_draft_worker
# Initialize token_to_kv_pool
is_nsa_model = is_deepseek_nsa(self.model_config.hf_config)
if self.server_args.attention_backend == "ascend":
if self.use_mla_backend:
self.token_to_kv_pool = AscendMLAPagedTokenToKVPool(
@@ -1555,6 +1563,7 @@ class ModelRunner:
dtype=self.kv_cache_dtype,
kv_lora_rank=self.model_config.kv_lora_rank,
qk_rope_head_dim=self.model_config.qk_rope_head_dim,
index_head_dim=self.model_config.index_head_dim,
layer_num=self.num_effective_layers,
device=self.device,
enable_memory_saver=self.server_args.enable_memory_saver,
@@ -1574,7 +1583,22 @@ class ModelRunner:
device=self.device,
enable_memory_saver=self.server_args.enable_memory_saver,
)
elif self.use_mla_backend and is_nsa_model:
self.token_to_kv_pool = NSATokenToKVPool(
self.max_total_num_tokens,
page_size=self.page_size,
dtype=self.kv_cache_dtype,
kv_lora_rank=self.model_config.kv_lora_rank,
qk_rope_head_dim=self.model_config.qk_rope_head_dim,
layer_num=self.num_effective_layers,
device=self.device,
enable_memory_saver=self.server_args.enable_memory_saver,
start_layer=self.start_layer,
end_layer=self.end_layer,
index_head_dim=get_nsa_index_head_dim(self.model_config.hf_config),
)
elif self.use_mla_backend:
assert not is_nsa_model
self.token_to_kv_pool = MLATokenToKVPool(
self.max_total_num_tokens,
page_size=self.page_size,