[CI] Add DeepSeek-V3.2-W8A8-Pruning e2e test (#5922)

### What this PR does / why we need it?

1. Fix DeepSeek-V3.2-W8A8-Pruning MTP.
2. Add DeepSeek-V3.2-W8A8-Pruning e2e test.

### How was this patch tested?

- vLLM version: v0.13.0
- vLLM main: 11b6af5280

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
2026-01-16 15:49:57 +08:00
parent 69b170b8b5
commit 4f446aec4c
4 changed files with 32 additions and 2 deletions
--- a/vllm_ascend/spec_decode/eagle_proposer.py
+++ b/vllm_ascend/spec_decode/eagle_proposer.py
@@ -106,6 +106,8 @@ class EagleProposer(VllmEagleProposer):
        self.dcp_size = self.runner.dcp_size
        self.pcp_rank = self.runner.pcp_rank
        self.dcp_rank = self.runner.dcp_rank
+        
+        self.use_aclgraph = self.runner._use_aclgraph()

        self.full_indices = range(
            self.runner.max_num_tokens * self.pcp_size * self.dcp_size +
--- a/vllm_ascend/spec_decode/mtp_proposer.py
+++ b/vllm_ascend/spec_decode/mtp_proposer.py
@@ -242,11 +242,11 @@ class MtpProposer(EagleProposer):
        assert self.runner is not None

        # Note(qcs): We may need to refactor these check logics.
-        if self.use_cuda_graph and num_scheduled_tokens <= self.runner.cudagraph_batch_sizes[
+        if self.runner.use_aclgraph and num_scheduled_tokens <= self.runner.cudagraph_batch_sizes[
                -1]:
            num_input_tokens = self.vllm_config.pad_for_cudagraph(
                num_scheduled_tokens)
-        elif self.use_cuda_graph and num_tokens <= self.runner.cudagraph_batch_sizes[
+        elif self.use_aclgraph  and num_tokens <= self.runner.cudagraph_batch_sizes[
                -1]:
            # Acl graph mode, add padding to the batch size
            num_input_tokens = self.vllm_config.pad_for_cudagraph(num_tokens)