[CI] Add DeepSeek-V3.2-W8A8-Pruning e2e test (#5922)
### What this PR does / why we need it?
1. Fix the MTP proposer for DeepSeek-V3.2-W8A8-Pruning.
2. Add an end-to-end (e2e) test for DeepSeek-V3.2-W8A8-Pruning.
### How was this patch tested?
- vLLM version: v0.13.0
- vLLM main: 11b6af5280
Signed-off-by: hfadzxy <starmoon_zhang@163.com>
This commit is contained in:
@@ -106,6 +106,8 @@ class EagleProposer(VllmEagleProposer):
|
||||
self.dcp_size = self.runner.dcp_size
|
||||
self.pcp_rank = self.runner.pcp_rank
|
||||
self.dcp_rank = self.runner.dcp_rank
|
||||
|
||||
self.use_aclgraph = self.runner._use_aclgraph()
|
||||
|
||||
self.full_indices = range(
|
||||
self.runner.max_num_tokens * self.pcp_size * self.dcp_size +
|
||||
|
||||
@@ -242,11 +242,11 @@ class MtpProposer(EagleProposer):
|
||||
assert self.runner is not None
|
||||
|
||||
# Note(qcs): We may need to refactor these check logics.
|
||||
if self.use_cuda_graph and num_scheduled_tokens <= self.runner.cudagraph_batch_sizes[
|
||||
if self.runner.use_aclgraph and num_scheduled_tokens <= self.runner.cudagraph_batch_sizes[
|
||||
-1]:
|
||||
num_input_tokens = self.vllm_config.pad_for_cudagraph(
|
||||
num_scheduled_tokens)
|
||||
elif self.use_cuda_graph and num_tokens <= self.runner.cudagraph_batch_sizes[
|
||||
elif self.use_aclgraph and num_tokens <= self.runner.cudagraph_batch_sizes[
|
||||
-1]:
|
||||
# Acl graph mode, add padding to the batch size
|
||||
num_input_tokens = self.vllm_config.pad_for_cudagraph(num_tokens)
|
||||
|
||||
Reference in New Issue
Block a user