[bugfix][ACLGraph][MTP] deletes cudagraph_batch_sizes in MtpProposer (#5183)
### What this PR does / why we need it?
This PR deletes `cudagraph_batch_sizes` in `MtpProposer` and reuses the
one in `NPUModelRunner`.
While deploying DeepSeek-V3.2 with MTP in a multi-machine 2P2D setup and
running AISBench stress tests, we hit an error. Investigation showed that
`adjust_cudagraph_sizes_for_spec_decode` in `NPUModelRunner` modifies
`compilation_config.cudagraph_capture_sizes`. That modification updates
`cudagraph_batch_sizes` in `NPUModelRunner` only and is never synchronized
to the copy held by `MtpProposer`. After discussion (CC @yiz-liu), we
concluded that `MtpProposer` does not need to maintain its own
`cudagraph_batch_sizes`; it should read the variable from `NPUModelRunner`
directly.
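
To make the failure mode concrete, here is a minimal, self-contained sketch (hypothetical `Runner`/`Proposer` classes and a simplified adjustment rule, not the actual vLLM-Ascend code) of how a copy of the capture sizes cached at construction time goes stale once the runner adjusts them:

```python
# Minimal sketch of the stale-copy bug; class names and the adjustment
# rule are illustrative, not the real vLLM-Ascend implementations.

class Runner:
    def __init__(self, capture_sizes: list[int]):
        self.cudagraph_batch_sizes = sorted(capture_sizes)

    def adjust_cudagraph_sizes_for_spec_decode(self, num_spec_tokens: int) -> None:
        # Assumed adjustment for illustration: scale each captured size
        # by (1 + num_spec_tokens) to account for speculative tokens.
        self.cudagraph_batch_sizes = [
            s * (1 + num_spec_tokens) for s in self.cudagraph_batch_sizes
        ]

class ProposerBefore:
    def __init__(self, runner: Runner):
        # Bug: a snapshot taken here never sees later adjustments.
        self.cudagraph_batch_sizes = list(runner.cudagraph_batch_sizes)

class ProposerAfter:
    def __init__(self, runner: Runner):
        # Fix: hold the runner and always read through it.
        self.runner = runner

runner = Runner([1, 2, 4, 8])
before, after = ProposerBefore(runner), ProposerAfter(runner)
runner.adjust_cudagraph_sizes_for_spec_decode(num_spec_tokens=1)

print(before.cudagraph_batch_sizes[-1])        # 8  -- stale upper bound
print(after.runner.cudagraph_batch_sizes[-1])  # 16 -- reflects the adjustment
```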
- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c
---------
Signed-off-by: QiuChunshuo <qiuchunshuo@huawei.com>
```diff
@@ -97,7 +97,6 @@ class TestMtpProposer:
         proposer = MtpProposer(vllm_config, torch.device("cpu"), runner)
 
         assert proposer.use_aclgraph is True
-        assert proposer.cudagraph_batch_sizes == [1, 2, 4, 8]
 
     @patch("vllm.config.get_layers_from_vllm_config")
     @patch("vllm_ascend.spec_decode.mtp_proposer.get_model_loader")
```
```diff
@@ -107,11 +107,6 @@ class MtpProposer(Proposer):
 
         self.use_aclgraph = self.runner._use_aclgraph()
 
-        self.cudagraph_batch_sizes = (list(
-            sorted(
-                self.vllm_config.compilation_config.cudagraph_capture_sizes))
-                                      if self.use_aclgraph else [])
-
         # persistent buffers for aclgraph graph
         self.input_ids = torch.zeros(self.max_num_tokens,
                                      dtype=torch.int32,
```
```diff
@@ -697,11 +692,13 @@ class MtpProposer(Proposer):
 
         assert self.runner is not None
 
-        if self.runner.use_aclgraph and num_scheduled_tokens <= self.cudagraph_batch_sizes[
+        # Note(qcs): We may need to refactor these check logics.
+        if self.runner.use_aclgraph and num_scheduled_tokens <= self.runner.cudagraph_batch_sizes[
                 -1]:
             num_input_tokens = self.vllm_config.pad_for_cudagraph(
                 num_scheduled_tokens)
-        elif self.use_aclgraph and num_tokens <= self.cudagraph_batch_sizes[-1]:
+        elif self.use_aclgraph and num_tokens <= self.runner.cudagraph_batch_sizes[
+                -1]:
             # Acl graph mode, add padding to the batch size
             num_input_tokens = self.vllm_config.pad_for_cudagraph(num_tokens)
         else:
```
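
For context on the guard `num_tokens <= cudagraph_batch_sizes[-1]` above: padding for a captured graph means rounding the token count up to the nearest capture size, so the largest size is the hard upper bound. Below is a hedged sketch of that rounding (a standalone helper written for illustration, not the actual `pad_for_cudagraph` implementation):

```python
import bisect

def pad_to_capture_size(num_tokens: int, capture_sizes: list[int]) -> int:
    """Round num_tokens up to the smallest capture size >= num_tokens.

    Assumes capture_sizes is sorted ascending and that the caller has
    already checked num_tokens <= capture_sizes[-1], mirroring the
    guard in the hunk above.
    """
    idx = bisect.bisect_left(capture_sizes, num_tokens)
    return capture_sizes[idx]

assert pad_to_capture_size(3, [1, 2, 4, 8]) == 4
assert pad_to_capture_size(8, [1, 2, 4, 8]) == 8
```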