[Bugfix] Fix attn_metadata being None (#5038)
### What this PR does / why we need it?
Fix the bug "TypeError: 'NoneType' object is not iterable" raised in
vllm_ascend/compilation/acl_graph.py.
The cause is that attn_metadata is None in the dummy_run of MTP.
- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c
Signed-off-by: chenmenglong <chenmenglong1@huawei.com>
This commit is contained in:
@@ -117,7 +117,7 @@ class EagleProposer(Proposer):
|
|||||||
def dummy_run(self,
|
def dummy_run(self,
|
||||||
num_tokens: int,
|
num_tokens: int,
|
||||||
with_prefill: bool = False,
|
with_prefill: bool = False,
|
||||||
skip_attn: bool = False,
|
in_graph_capturing: bool = False,
|
||||||
num_reqs: int = 0,
|
num_reqs: int = 0,
|
||||||
num_tokens_across_dp: Optional[torch.Tensor] = None,
|
num_tokens_across_dp: Optional[torch.Tensor] = None,
|
||||||
aclgraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE,
|
aclgraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE,
|
||||||
|
|||||||
@@ -32,7 +32,7 @@ class Proposer:
|
|||||||
def dummy_run(self,
|
def dummy_run(self,
|
||||||
num_tokens: int,
|
num_tokens: int,
|
||||||
with_prefill: bool = False,
|
with_prefill: bool = False,
|
||||||
skip_attn: bool = False,
|
in_graph_capturing: bool = False,
|
||||||
num_reqs: int = 0,
|
num_reqs: int = 0,
|
||||||
num_tokens_across_dp: Optional[torch.Tensor] = None,
|
num_tokens_across_dp: Optional[torch.Tensor] = None,
|
||||||
aclgraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE,
|
aclgraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE,
|
||||||
|
|||||||
@@ -223,7 +223,7 @@ class MtpProposer(Proposer):
|
|||||||
def dummy_run(self,
|
def dummy_run(self,
|
||||||
num_tokens: int,
|
num_tokens: int,
|
||||||
with_prefill: bool = False,
|
with_prefill: bool = False,
|
||||||
skip_attn: bool = False,
|
in_graph_capturing: bool = False,
|
||||||
num_reqs: int = 0,
|
num_reqs: int = 0,
|
||||||
num_tokens_across_dp=None,
|
num_tokens_across_dp=None,
|
||||||
aclgraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE,
|
aclgraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE,
|
||||||
@@ -247,9 +247,7 @@ class MtpProposer(Proposer):
|
|||||||
moe_comm_type = (MoECommType.ALLTOALL if moe_comm_type
|
moe_comm_type = (MoECommType.ALLTOALL if moe_comm_type
|
||||||
== MoECommType.FUSED_ALLTOALL else moe_comm_type)
|
== MoECommType.FUSED_ALLTOALL else moe_comm_type)
|
||||||
|
|
||||||
if skip_attn:
|
if aclgraph_runtime_mode == CUDAGraphMode.FULL:
|
||||||
attn_metadata = None
|
|
||||||
elif aclgraph_runtime_mode == CUDAGraphMode.FULL:
|
|
||||||
if len(self.runner.attn_groups) > 0:
|
if len(self.runner.attn_groups) > 0:
|
||||||
num_computed_tokens_cpu = (
|
num_computed_tokens_cpu = (
|
||||||
self.runner.input_batch.
|
self.runner.input_batch.
|
||||||
@@ -294,7 +292,7 @@ class MtpProposer(Proposer):
|
|||||||
positions = self.positions[:num_tokens]
|
positions = self.positions[:num_tokens]
|
||||||
previous_hidden_states = self.hidden_states[:num_tokens]
|
previous_hidden_states = self.hidden_states[:num_tokens]
|
||||||
for i in range(self.num_speculative_tokens):
|
for i in range(self.num_speculative_tokens):
|
||||||
if i > 0 and not skip_attn and aclgraph_runtime_mode == CUDAGraphMode.FULL:
|
if i > 0 and not in_graph_capturing and aclgraph_runtime_mode == CUDAGraphMode.FULL:
|
||||||
aclgraph_runtime_mode = CUDAGraphMode.NONE
|
aclgraph_runtime_mode = CUDAGraphMode.NONE
|
||||||
with set_ascend_forward_context(
|
with set_ascend_forward_context(
|
||||||
attn_metadata,
|
attn_metadata,
|
||||||
|
|||||||
@@ -22,7 +22,7 @@ class NgramProposer(VllmNgramProposer, Proposer):
|
|||||||
def dummy_run(self,
|
def dummy_run(self,
|
||||||
num_tokens,
|
num_tokens,
|
||||||
with_prefill=None,
|
with_prefill=None,
|
||||||
skip_attn=None,
|
in_graph_capturing=None,
|
||||||
num_reqs=None,
|
num_reqs=None,
|
||||||
num_tokens_across_dp=None,
|
num_tokens_across_dp=None,
|
||||||
aclgraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE,
|
aclgraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE,
|
||||||
|
|||||||
@@ -22,7 +22,7 @@ class SuffixDecodingProposer(VllmSuffixDecodingProposer, Proposer):
|
|||||||
def dummy_run(self,
|
def dummy_run(self,
|
||||||
num_tokens,
|
num_tokens,
|
||||||
with_prefill=None,
|
with_prefill=None,
|
||||||
skip_attn=None,
|
in_graph_capturing=None,
|
||||||
num_reqs=None,
|
num_reqs=None,
|
||||||
num_tokens_across_dp=None,
|
num_tokens_across_dp=None,
|
||||||
aclgraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE,
|
aclgraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE,
|
||||||
|
|||||||
@@ -2296,7 +2296,7 @@ class NPUModelRunner(GPUModelRunner):
|
|||||||
aclgraph_runtime_mode=aclgraph_runtime_mode,
|
aclgraph_runtime_mode=aclgraph_runtime_mode,
|
||||||
batch_descriptor=batch_descriptor,
|
batch_descriptor=batch_descriptor,
|
||||||
dummy_compute_logits=dummy_drafter_compute_logits,
|
dummy_compute_logits=dummy_drafter_compute_logits,
|
||||||
skip_attn=not force_attention)
|
in_graph_capturing=not force_attention)
|
||||||
if self.in_profile_run and self.dynamic_eplb:
|
if self.in_profile_run and self.dynamic_eplb:
|
||||||
self.model.clear_all_moe_loads()
|
self.model.clear_all_moe_loads()
|
||||||
if not self.in_profile_run and self.dynamic_eplb:
|
if not self.in_profile_run and self.dynamic_eplb:
|
||||||
|
|||||||
Reference in New Issue
Block a user