[3/N][Refactor] torchair model runner refactor (#2207)

There is a lot of torchair code in the model runner, which makes the code hard to maintain. We'll create a new torchair_model_runner to split out the torchair-related logic. Following the workflow in #2203, this is the first PR. What this PR does: create the common functions `_build_attention_metadata` and `_generate_dummy_run_hidden_states` for dummy_run - vLLM version: v0.10.0 - vLLM main: ebf7605b0d Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-08-11 18:03:19 +08:00
parent 29aaba5f84
commit 881e36d6a9
2 changed files with 89 additions and 66 deletions
--- a/vllm_ascend/torchair/torchair_model_runner.py
+++ b/vllm_ascend/torchair/torchair_model_runner.py
@@ -21,7 +21,10 @@ from typing import Optional

 import torch
 from vllm.config import VllmConfig
+from vllm.forward_context import get_forward_context

+from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ,
+                               maybe_converting_weight_acl_format)
 from vllm_ascend.worker.model_runner_v1 import NPUModelRunner


@@ -55,3 +58,58 @@ class NPUTorchairModelRunner(NPUModelRunner):
            maybe_padded_num_tokens = num_tokens

        return maybe_padded_num_tokens, num_tokens_across_dp, with_prefill, enable_dbo
+
+    def _build_attention_metadata(self, with_prefill, num_reqs, skip_attn):
+        """Build the attention metadata used by a dummy run.
+
+        Without prefill, the metadata is produced by the attention metadata
+        builder's ``build_torchair_graph_dummy`` for ``num_reqs`` requests
+        with ``num_actual_tokens=1``; ``skip_attn`` is not consulted on that
+        path. With prefill, the parent class implementation is used and
+        receives all three arguments unchanged.
+
+        Returns:
+            The attention metadata object produced by whichever path ran.
+        """
+        # NOTE: If torchair graph mode and not with_prefill,
+        # we can't skip_attn, it will cause graph recompile.
+        if not with_prefill:
+            attn_metadata = self.attn_metadata_builder.build_torchair_graph_dummy(
+                num_reqs=num_reqs, num_actual_tokens=1)
+        else:
+            # Prefill path: defer entirely to the base model runner.
+            attn_metadata = super()._build_attention_metadata(
+                with_prefill, num_reqs, skip_attn)
+        return attn_metadata
+
+    def _generate_dummy_run_hidden_states(self, with_prefill,
+                                          is_torchair_compile, input_ids,
+                                          positions, attn_metadata, num_tokens,
+                                          intermediate_tensors, inputs_embeds):
+        """Run the model once for a dummy run and return its hidden states.
+
+        Without prefill, the forward pass goes through the model returned by
+        ``self._get_torchair_lazy_compiled_model(num_tokens)``. When
+        ``is_torchair_compile`` is set, the inputs, selected attention
+        metadata tensors, the forward context's ``mc2_mask`` and the KV
+        caches are first marked static via ``torch._dynamo.mark_static``.
+        With prefill, the parent class implementation is called with all
+        arguments unchanged.
+
+        Returns:
+            Whatever the compiled model (no-prefill path) or the parent
+            implementation (prefill path) returns.
+        """
+
+        if not with_prefill:
+            # Only mark static while compiling
+            if is_torchair_compile:
+                torch._dynamo.mark_static(input_ids)
+                torch._dynamo.mark_static(positions)
+                torch._dynamo.mark_static(attn_metadata.decode.block_table)
+                torch._dynamo.mark_static(attn_metadata.decode.input_positions)
+                torch._dynamo.mark_static(get_forward_context().mc2_mask)
+                # sin/cos are only marked when the decode metadata exposes a
+                # `sin` attribute; `cos` is presumably present whenever `sin`
+                # is — TODO confirm.
+                if hasattr(attn_metadata.decode, "sin"):
+                    torch._dynamo.mark_static(attn_metadata.decode.sin)
+                    torch._dynamo.mark_static(attn_metadata.decode.cos)
+                torch._dynamo.mark_static(attn_metadata.slot_mapping)
+                # The decode attention mask is only marked when a speculative
+                # config is set on the runner.
+                if self.speculative_config:
+                    torch._dynamo.mark_static(attn_metadata.decode.attn_mask)
+                # Each KV cache entry must be a tuple; its first two elements
+                # are marked static.
+                for kv in self.kv_caches:
+                    assert isinstance(kv, tuple), "kv_cache must be a tuple"
+                    torch._dynamo.mark_static(kv[0])
+                    torch._dynamo.mark_static(kv[1])
+
+            # Runs on every no-prefill dummy run (not only while compiling),
+            # before the compiled model is fetched and invoked.
+            maybe_converting_weight_acl_format(self.model,
+                                               ACL_FORMAT_FRACTAL_NZ)
+
+            compiled_model = self._get_torchair_lazy_compiled_model(num_tokens)
+            model_kwargs = {}
+            model_kwargs["kv_caches"] = self.kv_caches
+            model_kwargs["attn_metadata"] = attn_metadata
+            # NOTE(review): `inputs_embeds` is passed as None here, so the
+            # `inputs_embeds` argument is not forwarded on this path —
+            # confirm this is intentional.
+            hidden_states = compiled_model(
+                input_ids=input_ids,
+                positions=positions,
+                intermediate_tensors=intermediate_tensors,
+                inputs_embeds=None,
+                **model_kwargs,
+            )
+        else:
+            # Prefill path: defer entirely to the base model runner.
+            hidden_states = super()._generate_dummy_run_hidden_states(
+                with_prefill, is_torchair_compile, input_ids, positions,
+                attn_metadata, num_tokens, intermediate_tensors, inputs_embeds)
+        return hidden_states