[CI] Add DeepSeek-V3.2-W8A8-Pruning e2e test (#5922)

### What this PR does / why we need it?
1. Fix DeepSeek-V3.2-W8A8-Pruning MTP (multi-token prediction)
2. Add DeepSeek-V3.2-W8A8-Pruning e2e test

### How was this patch tested?

- vLLM version: v0.13.0
- vLLM main:
11b6af5280

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
This commit is contained in:
zhangxinyuehfad
2026-01-16 15:49:57 +08:00
committed by GitHub
parent 69b170b8b5
commit 4f446aec4c
4 changed files with 32 additions and 2 deletions

View File

@@ -106,6 +106,8 @@ class EagleProposer(VllmEagleProposer):
self.dcp_size = self.runner.dcp_size
self.pcp_rank = self.runner.pcp_rank
self.dcp_rank = self.runner.dcp_rank
self.use_aclgraph = self.runner._use_aclgraph()
self.full_indices = range(
self.runner.max_num_tokens * self.pcp_size * self.dcp_size +

View File

@@ -242,11 +242,11 @@ class MtpProposer(EagleProposer):
assert self.runner is not None
# Note(qcs): We may need to refactor these check logics.
if self.use_cuda_graph and num_scheduled_tokens <= self.runner.cudagraph_batch_sizes[
if self.runner.use_aclgraph and num_scheduled_tokens <= self.runner.cudagraph_batch_sizes[
-1]:
num_input_tokens = self.vllm_config.pad_for_cudagraph(
num_scheduled_tokens)
elif self.use_cuda_graph and num_tokens <= self.runner.cudagraph_batch_sizes[
elif self.use_aclgraph and num_tokens <= self.runner.cudagraph_batch_sizes[
-1]:
# Acl graph mode, add padding to the batch size
num_input_tokens = self.vllm_config.pad_for_cudagraph(num_tokens)