From 54bd531db8c21954a87d53d96a90d75ecddebb8c Mon Sep 17 00:00:00 2001 From: Yizhou <136800916+yiz-liu@users.noreply.github.com> Date: Thu, 23 Oct 2025 10:29:30 +0800 Subject: [PATCH] [v0.11.0][Fix] Fix attention metadata handling for profiling and MLA (#3636) (#3643) ### What this PR does / why we need it? This is a port of PR #3636. Move the creation of dummy attention metadata to occur after the ACL graph runtime mode is determined. This ensures the metadata is initialized with the correct configuration during a profile run. Additionally, remove the `attn_metadata` existence check before updating MLA attention parameters. This change prevents the update from being skipped when metadata is not yet available, ensuring parameters are set correctly. ### Does this PR introduce _any_ user-facing change? None. ### How was this patch tested? None. Signed-off-by: Yizhou Liu --- vllm_ascend/worker/model_runner_v1.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index c7f74d0..0bfe0f8 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -2331,7 +2331,7 @@ class NPUModelRunner(LoRAModelRunnerMixin): forward_context = get_forward_context() assert forward_context is not None if forward_context.cudagraph_runtime_mode == CUDAGraphMode.FULL and \ - not forward_context.capturing and forward_context.attn_metadata is not None: + not forward_context.capturing: if self.vllm_config.model_config.use_mla: # FIXME: Try using `auto_dispatch_capture=True` update_mla_attn_params(self.update_stream, forward_context, @@ -2419,17 +2419,6 @@ class NPUModelRunner(LoRAModelRunnerMixin): if self.is_kv_producer and not self.is_kv_consumer: with_prefill = True - # TODO(Mengqing): Set create_mixed_batch to False since it's only used in FI warmup - # and not supported in ASCEND now. We could remove it in the future. 
- attn_metadata = self._build_dummy_attn_metadata( - False, - num_reqs=num_reqs, - num_tokens=num_tokens, - max_query_len=max_query_len, - aclgraph_runtime_mode=aclgraph_runtime_mode, - force_attention=force_attention, - ) - if not self.in_profile_run and self.dynamic_eplb: self.eplb_updator.forward_before() @@ -2476,6 +2465,17 @@ class NPUModelRunner(LoRAModelRunnerMixin): else: aclgraph_runtime_mode = _ag_mode + # TODO(Mengqing): Set create_mixed_batch to False since it's only used in FI warmup + # and not supported in ASCEND now. We could remove it in the future. + attn_metadata = self._build_dummy_attn_metadata( + False, + num_reqs=num_reqs, + num_tokens=num_tokens, + max_query_len=max_query_len, + aclgraph_runtime_mode=aclgraph_runtime_mode, + force_attention=force_attention, + ) + need_dummy_logits = (not self.in_profile_run and lmhead_tp_enable())