### What this PR does / why we need it? This is a port PR of #3636. It moves the creation of dummy attention metadata so that it occurs after the ACL graph runtime mode is determined, ensuring the metadata is initialized with the correct configuration during a profile run. Additionally, it removes the `attn_metadata` existence check before updating MLA attention parameters; this prevents the update from being skipped when metadata is not yet available, ensuring parameters are set correctly. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Not tested. Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
This commit is contained in:
@@ -2331,7 +2331,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
|||||||
forward_context = get_forward_context()
|
forward_context = get_forward_context()
|
||||||
assert forward_context is not None
|
assert forward_context is not None
|
||||||
if forward_context.cudagraph_runtime_mode == CUDAGraphMode.FULL and \
|
if forward_context.cudagraph_runtime_mode == CUDAGraphMode.FULL and \
|
||||||
not forward_context.capturing and forward_context.attn_metadata is not None:
|
not forward_context.capturing:
|
||||||
if self.vllm_config.model_config.use_mla:
|
if self.vllm_config.model_config.use_mla:
|
||||||
# FIXME: Try using `auto_dispatch_capture=True`
|
# FIXME: Try using `auto_dispatch_capture=True`
|
||||||
update_mla_attn_params(self.update_stream, forward_context,
|
update_mla_attn_params(self.update_stream, forward_context,
|
||||||
@@ -2419,17 +2419,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
|||||||
if self.is_kv_producer and not self.is_kv_consumer:
|
if self.is_kv_producer and not self.is_kv_consumer:
|
||||||
with_prefill = True
|
with_prefill = True
|
||||||
|
|
||||||
# TODO(Mengqing): Set create_mixed_batch to False since it's only used in FI warmup
|
|
||||||
# and not supported in ASCEND now. We could remove it in the future.
|
|
||||||
attn_metadata = self._build_dummy_attn_metadata(
|
|
||||||
False,
|
|
||||||
num_reqs=num_reqs,
|
|
||||||
num_tokens=num_tokens,
|
|
||||||
max_query_len=max_query_len,
|
|
||||||
aclgraph_runtime_mode=aclgraph_runtime_mode,
|
|
||||||
force_attention=force_attention,
|
|
||||||
)
|
|
||||||
|
|
||||||
if not self.in_profile_run and self.dynamic_eplb:
|
if not self.in_profile_run and self.dynamic_eplb:
|
||||||
self.eplb_updator.forward_before()
|
self.eplb_updator.forward_before()
|
||||||
|
|
||||||
@@ -2476,6 +2465,17 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
|||||||
else:
|
else:
|
||||||
aclgraph_runtime_mode = _ag_mode
|
aclgraph_runtime_mode = _ag_mode
|
||||||
|
|
||||||
|
# TODO(Mengqing): Set create_mixed_batch to False since it's only used in FI warmup
|
||||||
|
# and not supported in ASCEND now. We could remove it in the future.
|
||||||
|
attn_metadata = self._build_dummy_attn_metadata(
|
||||||
|
False,
|
||||||
|
num_reqs=num_reqs,
|
||||||
|
num_tokens=num_tokens,
|
||||||
|
max_query_len=max_query_len,
|
||||||
|
aclgraph_runtime_mode=aclgraph_runtime_mode,
|
||||||
|
force_attention=force_attention,
|
||||||
|
)
|
||||||
|
|
||||||
need_dummy_logits = (not self.in_profile_run
|
need_dummy_logits = (not self.in_profile_run
|
||||||
and lmhead_tp_enable())
|
and lmhead_tp_enable())
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user