[Bugfix] Fix Qwen3/DSV3/DSV3.2 model support (#11510)

2025-10-16 15:14:09 +08:00
parent b0d20cdec7
commit 3cceaa381a
12 changed files with 102 additions and 33 deletions
--- a/python/sglang/srt/layers/attention/ascend_backend.py
+++ b/python/sglang/srt/layers/attention/ascend_backend.py
@@ -356,6 +356,11 @@ class AscendAttnBackend(AttentionBackend):
            assert (
                layer.qk_head_dim != layer.v_head_dim
            ), "FIA only supports qk_head_dim != v_head_dim"
+            num_token_padding = q.shape[0]
+            q, k, v = [
+                data[: forward_batch.num_token_non_padded_cpu] for data in [q, k, v]
+            ]
+
            q_nope, q_rope = q.split([layer.v_head_dim, self.qk_rope_head_dim], dim=-1)
            k_nope, k_rope = k.split([layer.v_head_dim, self.qk_rope_head_dim], dim=-1)

@@ -375,6 +380,18 @@ class AscendAttnBackend(AttentionBackend):
                next_tokens=0,
            )

+            attn_output = attn_output.reshape(-1, layer.tp_q_head_num, layer.v_head_dim)
+            if num_token_padding != forward_batch.num_token_non_padded_cpu:
+                attn_output = torch.cat(
+                    [
+                        attn_output,
+                        attn_output.new_zeros(
+                            num_token_padding - attn_output.shape[0],
+                            *attn_output.shape[1:],
+                        ),
+                    ],
+                    dim=0,
+                )
        return attn_output

    def forward_decode_graph(
--- a/python/sglang/srt/mem_cache/allocator_ascend.py
+++ b/python/sglang/srt/mem_cache/allocator_ascend.py
@@ -119,7 +119,7 @@ class AscendPagedTokenToKVPoolAllocator(PagedTokenToKVPoolAllocator):
            assert len(torch.unique(out_indices)) == len(out_indices)

        self.free_pages = self.free_pages[num_new_pages_item:]
-        return out_indices
+        return out_indices.int()

    def alloc_decode(
        self,
--- a/python/sglang/srt/mem_cache/common.py
+++ b/python/sglang/srt/mem_cache/common.py
@@ -347,11 +347,7 @@ def alloc_for_extend(
    else:
        # Paged allocation - build last_loc
        last_loc = [
-            (
-                t[-1:]
-                if len(t) > 0
-                else torch.tensor([-1], device=batch.tree_cache.device)
-            )
+            (t[-1:] if len(t) > 0 else torch.tensor([-1], device=batch.device))
            for t in prefix_tensors
        ]
        out_cache_loc = alloc_paged_token_slots_extend(
--- a/python/sglang/srt/model_executor/npu_graph_runner.py
+++ b/python/sglang/srt/model_executor/npu_graph_runner.py
@@ -22,7 +22,7 @@ from typing import TYPE_CHECKING, Optional, Union
 import numpy as np
 import torch

-from sglang.srt.configs.model_config import AttentionArch
+from sglang.srt.configs.model_config import AttentionArch, is_deepseek_nsa
 from sglang.srt.model_executor.cuda_graph_runner import CudaGraphRunner

 logger = logging.getLogger(__name__)
@@ -75,7 +75,7 @@ class NPUGraphRunner(CudaGraphRunner):
            self.positions[: self.raw_num_token].copy_(forward_batch.positions)

        # Replay
-        if self.model_runner.model_config.index_head_dim is None:
+        if not is_deepseek_nsa(self.model_runner.model_config.hf_config):
            seq_lens = forward_batch.seq_lens.cpu().tolist() + [0] * (
                self.bs - self.raw_bs
            )
--- a/python/sglang/srt/models/deepseek_v2.py
+++ b/python/sglang/srt/models/deepseek_v2.py
@@ -1357,6 +1357,7 @@ class DeepseekV2AttentionMLA(nn.Module):
                inner_state = self.mla_preprocess.forward(
                    positions, hidden_states, forward_batch, zero_allocator
                )
+                inner_state = (*inner_state, None)  # None placeholder for topk_indices
        elif attn_forward_method == AttnForwardMethod.NPU_MLA_SPARSE:
            inner_state = self.forward_npu_sparse_prepare(
                positions, hidden_states, forward_batch, zero_allocator
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -628,6 +628,16 @@ class ServerArgs:
                    self.chunked_prefill_size = 2048
                if self.cuda_graph_max_bs is None:
                    self.cuda_graph_max_bs = 8
+            elif is_npu() and gpu_mem < 32 * 1024:
+                # Atlas A2B4
+                # (chunked_prefill_size 32k, cuda_graph_max_bs 16 if tp < 4 else 64)
+                if self.chunked_prefill_size is None:
+                    self.chunked_prefill_size = 32768
+                if self.cuda_graph_max_bs is None:
+                    if self.tp_size < 4:
+                        self.cuda_graph_max_bs = 16
+                    else:
+                        self.cuda_graph_max_bs = 64
            elif gpu_mem < 35 * 1024:
                # A10, 4090, 5090
                # (chunked_prefill_size 2k, cuda_graph_max_bs 16 if tp < 4 else 80)
@@ -651,6 +661,16 @@ class ServerArgs:
                        self.cuda_graph_max_bs = 32
                    else:
                        self.cuda_graph_max_bs = 160
+            elif is_npu() and gpu_mem < 64 * 1024:
+                # Atlas A2 and Atlas A3
+                # (chunked_prefill_size 32k, cuda_graph_max_bs 64 if tp < 4 else 128)
+                if self.chunked_prefill_size is None:
+                    self.chunked_prefill_size = 32768
+                if self.cuda_graph_max_bs is None:
+                    if self.tp_size < 4:
+                        self.cuda_graph_max_bs = 64
+                    else:
+                        self.cuda_graph_max_bs = 128
            elif gpu_mem < 90 * 1024:
                # H100, A100
                # (chunked_prefill_size 8k, cuda_graph_max_bs 256 if tp < 4 else 512)