Support DeepSeek V3.2 Exp (#11061)
Co-authored-by: Stefan He <11166516+hebiao064@users.noreply.github.com>
Co-authored-by: Liangsheng Yin <95566987+hnyls2002@users.noreply.github.com>
Co-authored-by: Baizhou Zhang <56809903+fridge003@users.noreply.github.com>
Co-authored-by: DarkSharpness <76582120+darksharpness@users.noreply.github.com>
Co-authored-by: ZhengdQin <46387172+zhengdqin@users.noreply.github.com>
Co-authored-by: DarkSharpness <2040703891@qq.com>
Co-authored-by: hnyls2002 <lsyincs@gmail.com>
Co-authored-by: Zhengda Qin <zhengdqin@gmail.com>
Co-authored-by: Liangsheng Yin <hnyls2002@gmail.com>
Co-authored-by: HAI <hixiao@gmail.com>
Co-authored-by: Baizhou Zhang <sobereddiezhang@gmail.com>
This commit is contained in:
@@ -31,7 +31,12 @@ import torch.distributed as dist
|
||||
|
||||
from sglang.srt.configs.device_config import DeviceConfig
|
||||
from sglang.srt.configs.load_config import LoadConfig, LoadFormat
|
||||
from sglang.srt.configs.model_config import AttentionArch, ModelConfig
|
||||
from sglang.srt.configs.model_config import (
|
||||
AttentionArch,
|
||||
ModelConfig,
|
||||
get_nsa_index_head_dim,
|
||||
is_deepseek_nsa,
|
||||
)
|
||||
from sglang.srt.configs.update_config import adjust_config_with_unaligned_cpu_tp
|
||||
from sglang.srt.constants import GPU_MEMORY_TYPE_WEIGHTS
|
||||
from sglang.srt.distributed import (
|
||||
@@ -96,6 +101,7 @@ from sglang.srt.mem_cache.memory_pool import (
|
||||
HybridReqToTokenPool,
|
||||
MHATokenToKVPool,
|
||||
MLATokenToKVPool,
|
||||
NSATokenToKVPool,
|
||||
ReqToTokenPool,
|
||||
SWAKVPool,
|
||||
)
|
||||
@@ -157,6 +163,7 @@ MLA_ATTENTION_BACKENDS = [
|
||||
"cutlass_mla",
|
||||
"trtllm_mla",
|
||||
"ascend",
|
||||
"nsa",
|
||||
]
|
||||
|
||||
|
||||
@@ -1547,6 +1554,7 @@ class ModelRunner:
|
||||
assert self.is_draft_worker
|
||||
|
||||
# Initialize token_to_kv_pool
|
||||
is_nsa_model = is_deepseek_nsa(self.model_config.hf_config)
|
||||
if self.server_args.attention_backend == "ascend":
|
||||
if self.use_mla_backend:
|
||||
self.token_to_kv_pool = AscendMLAPagedTokenToKVPool(
|
||||
@@ -1555,6 +1563,7 @@ class ModelRunner:
|
||||
dtype=self.kv_cache_dtype,
|
||||
kv_lora_rank=self.model_config.kv_lora_rank,
|
||||
qk_rope_head_dim=self.model_config.qk_rope_head_dim,
|
||||
index_head_dim=self.model_config.index_head_dim,
|
||||
layer_num=self.num_effective_layers,
|
||||
device=self.device,
|
||||
enable_memory_saver=self.server_args.enable_memory_saver,
|
||||
@@ -1574,7 +1583,22 @@ class ModelRunner:
|
||||
device=self.device,
|
||||
enable_memory_saver=self.server_args.enable_memory_saver,
|
||||
)
|
||||
elif self.use_mla_backend and is_nsa_model:
|
||||
self.token_to_kv_pool = NSATokenToKVPool(
|
||||
self.max_total_num_tokens,
|
||||
page_size=self.page_size,
|
||||
dtype=self.kv_cache_dtype,
|
||||
kv_lora_rank=self.model_config.kv_lora_rank,
|
||||
qk_rope_head_dim=self.model_config.qk_rope_head_dim,
|
||||
layer_num=self.num_effective_layers,
|
||||
device=self.device,
|
||||
enable_memory_saver=self.server_args.enable_memory_saver,
|
||||
start_layer=self.start_layer,
|
||||
end_layer=self.end_layer,
|
||||
index_head_dim=get_nsa_index_head_dim(self.model_config.hf_config),
|
||||
)
|
||||
elif self.use_mla_backend:
|
||||
assert not is_nsa_model
|
||||
self.token_to_kv_pool = MLATokenToKVPool(
|
||||
self.max_total_num_tokens,
|
||||
page_size=self.page_size,
|
||||
|
||||
Reference in New Issue
Block a user