From 4381d296e5a71edad7e72add11169f0dfeb668ec Mon Sep 17 00:00:00 2001
From: Yizhou <136800916+yiz-liu@users.noreply.github.com>
Date: Thu, 23 Oct 2025 09:35:18 +0800
Subject: [PATCH] [Fix] Fix attention metadata handling for profiling and MLA
 (#3636)

### What this PR does / why we need it?
Move the creation of dummy attention metadata to occur after the ACL
graph runtime mode is determined. This ensures the metadata is
initialized with the correct configuration during a profile run.

Additionally, remove the `attn_metadata` existence check before updating
MLA attention parameters. This change prevents the update from being
skipped when metadata is not yet available, ensuring parameters are set
correctly.

### Does this PR introduce _any_ user-facing change?
None.

### How was this patch tested?
None.

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
---
 vllm_ascend/worker/model_runner_v1.py | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index c7f74d0a..0bfe0f84 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -2331,7 +2331,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         forward_context = get_forward_context()
         assert forward_context is not None
         if forward_context.cudagraph_runtime_mode == CUDAGraphMode.FULL and \
-            not forward_context.capturing and forward_context.attn_metadata is not None:
+            not forward_context.capturing:
             if self.vllm_config.model_config.use_mla:
                 # FIXME: Try using `auto_dispatch_capture=True`
                 update_mla_attn_params(self.update_stream, forward_context,
@@ -2419,17 +2419,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         if self.is_kv_producer and not self.is_kv_consumer:
             with_prefill = True
 
-        # TODO(Mengqing): Set create_mixed_batch to False since it's only used in FI warmup
-        # and not supported in ASCEND now. We could remove it in the future.
-        attn_metadata = self._build_dummy_attn_metadata(
-            False,
-            num_reqs=num_reqs,
-            num_tokens=num_tokens,
-            max_query_len=max_query_len,
-            aclgraph_runtime_mode=aclgraph_runtime_mode,
-            force_attention=force_attention,
-        )
-
         if not self.in_profile_run and self.dynamic_eplb:
             self.eplb_updator.forward_before()
 
@@ -2476,6 +2465,17 @@ class NPUModelRunner(LoRAModelRunnerMixin):
             else:
                 aclgraph_runtime_mode = _ag_mode
 
+            # TODO(Mengqing): Set create_mixed_batch to False since it's only used in FI warmup
+            # and not supported in ASCEND now. We could remove it in the future.
+            attn_metadata = self._build_dummy_attn_metadata(
+                False,
+                num_reqs=num_reqs,
+                num_tokens=num_tokens,
+                max_query_len=max_query_len,
+                aclgraph_runtime_mode=aclgraph_runtime_mode,
+                force_attention=force_attention,
+            )
+
             need_dummy_logits = (not self.in_profile_run
                                  and lmhead_tp_enable())