[Fix] Fix attention metadata handling for profiling and MLA (#3636)
### What this PR does / why we need it? Move the creation of the dummy attention metadata so that it occurs after the ACL graph runtime mode has been determined. This ensures the metadata is initialized with the correct configuration during a profile run. Additionally, remove the `attn_metadata` existence check that guarded the MLA attention parameter update. Previously, the update was silently skipped whenever the metadata was not yet available; removing the check ensures the parameters are always set correctly. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Not tested. - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0 Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
This commit is contained in:
@@ -2331,7 +2331,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
forward_context = get_forward_context()
|
||||
assert forward_context is not None
|
||||
if forward_context.cudagraph_runtime_mode == CUDAGraphMode.FULL and \
|
||||
not forward_context.capturing and forward_context.attn_metadata is not None:
|
||||
not forward_context.capturing:
|
||||
if self.vllm_config.model_config.use_mla:
|
||||
# FIXME: Try using `auto_dispatch_capture=True`
|
||||
update_mla_attn_params(self.update_stream, forward_context,
|
||||
@@ -2419,17 +2419,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
if self.is_kv_producer and not self.is_kv_consumer:
|
||||
with_prefill = True
|
||||
|
||||
# TODO(Mengqing): Set create_mixed_batch to False since it's only used in FI warmup
|
||||
# and not supported in ASCEND now. We could remove it in the future.
|
||||
attn_metadata = self._build_dummy_attn_metadata(
|
||||
False,
|
||||
num_reqs=num_reqs,
|
||||
num_tokens=num_tokens,
|
||||
max_query_len=max_query_len,
|
||||
aclgraph_runtime_mode=aclgraph_runtime_mode,
|
||||
force_attention=force_attention,
|
||||
)
|
||||
|
||||
if not self.in_profile_run and self.dynamic_eplb:
|
||||
self.eplb_updator.forward_before()
|
||||
|
||||
@@ -2476,6 +2465,17 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
else:
|
||||
aclgraph_runtime_mode = _ag_mode
|
||||
|
||||
# TODO(Mengqing): Set create_mixed_batch to False since it's only used in FI warmup
|
||||
# and not supported in ASCEND now. We could remove it in the future.
|
||||
attn_metadata = self._build_dummy_attn_metadata(
|
||||
False,
|
||||
num_reqs=num_reqs,
|
||||
num_tokens=num_tokens,
|
||||
max_query_len=max_query_len,
|
||||
aclgraph_runtime_mode=aclgraph_runtime_mode,
|
||||
force_attention=force_attention,
|
||||
)
|
||||
|
||||
need_dummy_logits = (not self.in_profile_run
|
||||
and lmhead_tp_enable())
|
||||
|
||||
|
||||
Reference in New Issue
Block a user