[Fix] Fix attention metadata handling for profiling and MLA (#3636)
### What this PR does / why we need it? Move the creation of the dummy attention metadata so that it occurs after the ACL graph runtime mode has been determined. This ensures the metadata is initialized with the correct configuration during a profile run. Additionally, remove the `attn_metadata` existence check that guarded the MLA attention parameter update. Previously, the update was silently skipped whenever the metadata was not yet available; removing the check ensures the parameters are always set correctly. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Not tested. - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0 Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
This commit is contained in:
@@ -2331,7 +2331,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
forward_context = get_forward_context()
|
||||
assert forward_context is not None
|
||||
if forward_context.cudagraph_runtime_mode == CUDAGraphMode.FULL and \
|
||||
not forward_context.capturing and forward_context.attn_metadata is not None:
|
||||
not forward_context.capturing:
|
||||
if self.vllm_config.model_config.use_mla:
|
||||
# FIXME: Try using `auto_dispatch_capture=True`
|
||||
update_mla_attn_params(self.update_stream, forward_context,
|
||||
@@ -2419,17 +2419,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
if self.is_kv_producer and not self.is_kv_consumer:
|
||||
with_prefill = True
|
||||
|
||||
# TODO(Mengqing): Set create_mixed_batch to False since it's only used in FI warmup
|
||||
# and not supported in ASCEND now. We could remove it in the future.
|
||||
attn_metadata = self._build_dummy_attn_metadata(
|
||||
False,
|
||||
num_reqs=num_reqs,
|
||||
num_tokens=num_tokens,
|
||||
max_query_len=max_query_len,
|
||||
aclgraph_runtime_mode=aclgraph_runtime_mode,
|
||||
force_attention=force_attention,
|
||||
)
|
||||
|
||||
if not self.in_profile_run and self.dynamic_eplb:
|
||||
self.eplb_updator.forward_before()
|
||||
|
||||
@@ -2476,6 +2465,17 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
else:
|
||||
aclgraph_runtime_mode = _ag_mode
|
||||
|
||||
# TODO(Mengqing): Set create_mixed_batch to False since it's only used in FI warmup
|
||||
# and not supported in ASCEND now. We could remove it in the future.
|
||||
attn_metadata = self._build_dummy_attn_metadata(
|
||||
False,
|
||||
num_reqs=num_reqs,
|
||||
num_tokens=num_tokens,
|
||||
max_query_len=max_query_len,
|
||||
aclgraph_runtime_mode=aclgraph_runtime_mode,
|
||||
force_attention=force_attention,
|
||||
)
|
||||
|
||||
need_dummy_logits = (not self.in_profile_run
|
||||
and lmhead_tp_enable())
|
||||
|
||||
|
||||
Reference in New Issue
Block a user