[Bugfix] Fix Qwen3/DSV3/DSV3.2 model support (#11510)

This commit is contained in:
Even Zhou
2025-10-16 15:14:09 +08:00
committed by GitHub
parent b0d20cdec7
commit 3cceaa381a
12 changed files with 102 additions and 33 deletions

View File

@@ -356,6 +356,11 @@ class AscendAttnBackend(AttentionBackend):
assert (
layer.qk_head_dim != layer.v_head_dim
), "FIA only supports qk_head_dim != v_head_dim"
num_token_padding = q.shape[0]
q, k, v = [
data[: forward_batch.num_token_non_padded_cpu] for data in [q, k, v]
]
q_nope, q_rope = q.split([layer.v_head_dim, self.qk_rope_head_dim], dim=-1)
k_nope, k_rope = k.split([layer.v_head_dim, self.qk_rope_head_dim], dim=-1)
@@ -375,6 +380,18 @@ class AscendAttnBackend(AttentionBackend):
next_tokens=0,
)
attn_output = attn_output.reshape(-1, layer.tp_q_head_num, layer.v_head_dim)
if num_token_padding != forward_batch.num_token_non_padded_cpu:
attn_output = torch.cat(
[
attn_output,
attn_output.new_zeros(
num_token_padding - attn_output.shape[0],
*attn_output.shape[1:],
),
],
dim=0,
)
return attn_output
def forward_decode_graph(

View File

@@ -119,7 +119,7 @@ class AscendPagedTokenToKVPoolAllocator(PagedTokenToKVPoolAllocator):
assert len(torch.unique(out_indices)) == len(out_indices)
self.free_pages = self.free_pages[num_new_pages_item:]
return out_indices
return out_indices.int()
def alloc_decode(
self,

View File

@@ -347,11 +347,7 @@ def alloc_for_extend(
else:
# Paged allocation - build last_loc
last_loc = [
(
t[-1:]
if len(t) > 0
else torch.tensor([-1], device=batch.tree_cache.device)
)
(t[-1:] if len(t) > 0 else torch.tensor([-1], device=batch.device))
for t in prefix_tensors
]
out_cache_loc = alloc_paged_token_slots_extend(

View File

@@ -22,7 +22,7 @@ from typing import TYPE_CHECKING, Optional, Union
import numpy as np
import torch
from sglang.srt.configs.model_config import AttentionArch
from sglang.srt.configs.model_config import AttentionArch, is_deepseek_nsa
from sglang.srt.model_executor.cuda_graph_runner import CudaGraphRunner
logger = logging.getLogger(__name__)
@@ -75,7 +75,7 @@ class NPUGraphRunner(CudaGraphRunner):
self.positions[: self.raw_num_token].copy_(forward_batch.positions)
# Replay
if self.model_runner.model_config.index_head_dim is None:
if not is_deepseek_nsa(self.model_runner.model_config.hf_config):
seq_lens = forward_batch.seq_lens.cpu().tolist() + [0] * (
self.bs - self.raw_bs
)

View File

@@ -1357,6 +1357,7 @@ class DeepseekV2AttentionMLA(nn.Module):
inner_state = self.mla_preprocess.forward(
positions, hidden_states, forward_batch, zero_allocator
)
inner_state = (*inner_state, None) # add a position for topk_indices
elif attn_forward_method == AttnForwardMethod.NPU_MLA_SPARSE:
inner_state = self.forward_npu_sparse_prepare(
positions, hidden_states, forward_batch, zero_allocator

View File

@@ -628,6 +628,16 @@ class ServerArgs:
self.chunked_prefill_size = 2048
if self.cuda_graph_max_bs is None:
self.cuda_graph_max_bs = 8
elif is_npu() and gpu_mem < 32 * 1024:
# Atlas A2B4
# (chunked_prefill_size 32k, cuda_graph_max_bs 16 if tp < 4 else 64)
if self.chunked_prefill_size is None:
self.chunked_prefill_size = 32768
if self.cuda_graph_max_bs is None:
if self.tp_size < 4:
self.cuda_graph_max_bs = 16
else:
self.cuda_graph_max_bs = 64
elif gpu_mem < 35 * 1024:
# A10, 4090, 5090
# (chunked_prefill_size 2k, cuda_graph_max_bs 16 if tp < 4 else 80)
@@ -651,6 +661,16 @@ class ServerArgs:
self.cuda_graph_max_bs = 32
else:
self.cuda_graph_max_bs = 160
elif is_npu() and gpu_mem < 64 * 1024:
# Atlas A2 and Atlas A3
# (chunked_prefill_size 32k, cuda_graph_max_bs 64 if tp < 4 else 128)
if self.chunked_prefill_size is None:
self.chunked_prefill_size = 32768
if self.cuda_graph_max_bs is None:
if self.tp_size < 4:
self.cuda_graph_max_bs = 64
else:
self.cuda_graph_max_bs = 128
elif gpu_mem < 90 * 1024:
# H100, A100
# (chunked_prefill_size 8k, cuda_graph_max_bs 256 if tp < 4 else 512)