[Bugfix] Fix Qwen3/DSV3/DSV3.2 model support (#11510)
This commit is contained in:
@@ -356,6 +356,11 @@ class AscendAttnBackend(AttentionBackend):
|
||||
assert (
|
||||
layer.qk_head_dim != layer.v_head_dim
|
||||
), "FIA only supports qk_head_dim != v_head_dim"
|
||||
num_token_padding = q.shape[0]
|
||||
q, k, v = [
|
||||
data[: forward_batch.num_token_non_padded_cpu] for data in [q, k, v]
|
||||
]
|
||||
|
||||
q_nope, q_rope = q.split([layer.v_head_dim, self.qk_rope_head_dim], dim=-1)
|
||||
k_nope, k_rope = k.split([layer.v_head_dim, self.qk_rope_head_dim], dim=-1)
|
||||
|
||||
@@ -375,6 +380,18 @@ class AscendAttnBackend(AttentionBackend):
|
||||
next_tokens=0,
|
||||
)
|
||||
|
||||
attn_output = attn_output.reshape(-1, layer.tp_q_head_num, layer.v_head_dim)
|
||||
if num_token_padding != forward_batch.num_token_non_padded_cpu:
|
||||
attn_output = torch.cat(
|
||||
[
|
||||
attn_output,
|
||||
attn_output.new_zeros(
|
||||
num_token_padding - attn_output.shape[0],
|
||||
*attn_output.shape[1:],
|
||||
),
|
||||
],
|
||||
dim=0,
|
||||
)
|
||||
return attn_output
|
||||
|
||||
def forward_decode_graph(
|
||||
|
||||
@@ -119,7 +119,7 @@ class AscendPagedTokenToKVPoolAllocator(PagedTokenToKVPoolAllocator):
|
||||
assert len(torch.unique(out_indices)) == len(out_indices)
|
||||
|
||||
self.free_pages = self.free_pages[num_new_pages_item:]
|
||||
- return out_indices
|
||||
+ return out_indices.int()
|
||||
|
||||
def alloc_decode(
|
||||
self,
|
||||
|
||||
@@ -347,11 +347,7 @@ def alloc_for_extend(
|
||||
else:
|
||||
# Paged allocation - build last_loc
|
||||
last_loc = [
|
||||
- (
|
||||
- t[-1:]
|
||||
- if len(t) > 0
|
||||
- else torch.tensor([-1], device=batch.tree_cache.device)
|
||||
- )
|
||||
+ (t[-1:] if len(t) > 0 else torch.tensor([-1], device=batch.device))
|
||||
for t in prefix_tensors
|
||||
]
|
||||
out_cache_loc = alloc_paged_token_slots_extend(
|
||||
|
||||
@@ -22,7 +22,7 @@ from typing import TYPE_CHECKING, Optional, Union
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
- from sglang.srt.configs.model_config import AttentionArch
|
||||
+ from sglang.srt.configs.model_config import AttentionArch, is_deepseek_nsa
|
||||
from sglang.srt.model_executor.cuda_graph_runner import CudaGraphRunner
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -75,7 +75,7 @@ class NPUGraphRunner(CudaGraphRunner):
|
||||
self.positions[: self.raw_num_token].copy_(forward_batch.positions)
|
||||
|
||||
# Replay
|
||||
- if self.model_runner.model_config.index_head_dim is None:
|
||||
+ if not is_deepseek_nsa(self.model_runner.model_config.hf_config):
|
||||
seq_lens = forward_batch.seq_lens.cpu().tolist() + [0] * (
|
||||
self.bs - self.raw_bs
|
||||
)
|
||||
|
||||
@@ -1357,6 +1357,7 @@ class DeepseekV2AttentionMLA(nn.Module):
|
||||
inner_state = self.mla_preprocess.forward(
|
||||
positions, hidden_states, forward_batch, zero_allocator
|
||||
)
|
||||
inner_state = (*inner_state, None) # add a position for topk_indices
|
||||
elif attn_forward_method == AttnForwardMethod.NPU_MLA_SPARSE:
|
||||
inner_state = self.forward_npu_sparse_prepare(
|
||||
positions, hidden_states, forward_batch, zero_allocator
|
||||
|
||||
@@ -628,6 +628,16 @@ class ServerArgs:
|
||||
self.chunked_prefill_size = 2048
|
||||
if self.cuda_graph_max_bs is None:
|
||||
self.cuda_graph_max_bs = 8
|
||||
elif is_npu() and gpu_mem < 32 * 1024:
|
||||
# Atlas A2B4
|
||||
# (chunked_prefill_size 32k, cuda_graph_max_bs 16 if tp < 4 else 64)
|
||||
if self.chunked_prefill_size is None:
|
||||
self.chunked_prefill_size = 32768
|
||||
if self.cuda_graph_max_bs is None:
|
||||
if self.tp_size < 4:
|
||||
self.cuda_graph_max_bs = 16
|
||||
else:
|
||||
self.cuda_graph_max_bs = 64
|
||||
elif gpu_mem < 35 * 1024:
|
||||
# A10, 4090, 5090
|
||||
# (chunked_prefill_size 2k, cuda_graph_max_bs 16 if tp < 4 else 80)
|
||||
@@ -651,6 +661,16 @@ class ServerArgs:
|
||||
self.cuda_graph_max_bs = 32
|
||||
else:
|
||||
self.cuda_graph_max_bs = 160
|
||||
elif is_npu() and gpu_mem < 64 * 1024:
|
||||
# Atlas A2 and Atlas A3
|
||||
# (chunked_prefill_size 32k, cuda_graph_max_bs 64 if tp < 4 else 128)
|
||||
if self.chunked_prefill_size is None:
|
||||
self.chunked_prefill_size = 32768
|
||||
if self.cuda_graph_max_bs is None:
|
||||
if self.tp_size < 4:
|
||||
self.cuda_graph_max_bs = 64
|
||||
else:
|
||||
self.cuda_graph_max_bs = 128
|
||||
elif gpu_mem < 90 * 1024:
|
||||
# H100, A100
|
||||
# (chunked_prefill_size 8k, cuda_graph_max_bs 256 if tp < 4 else 512)
|
||||
|
||||
Reference in New Issue
Block a user