[Feature] model_runner refactor (#4764)
### What this PR does / why we need it?
Refactor the NPU model runner so that its structure stays close to the GPU model runner.
### Does this PR introduce _any_ user-facing change?
NO
- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c
---------
Signed-off-by: zhenwenqi2024 <zhenwenqi_2022@qq.com>
Signed-off-by: zhenwenqi2024 <155598497+zhenwenqi2024@users.noreply.github.com>
This commit is contained in:
@@ -25,6 +25,7 @@ from vllm.v1.attention.backends.utils import AttentionCGSupport
|
||||
|
||||
from vllm_ascend import envs
|
||||
from vllm_ascend.ascend_config import get_ascend_config
|
||||
from vllm_ascend.ascend_forward_context import get_cos_and_sin
|
||||
from vllm_ascend.attention.attention_v1 import AscendAttentionState
|
||||
from vllm_ascend.attention.utils import (AscendCommonAttentionMetadata,
|
||||
maybe_save_kv_layer_to_connector,
|
||||
@@ -625,8 +626,7 @@ class AscendMLAMetadataBuilder:
|
||||
|
||||
decode_metadata = None
|
||||
if num_decodes > 0:
|
||||
cos = common_attn_metadata.cos
|
||||
sin = common_attn_metadata.sin
|
||||
cos, sin = get_cos_and_sin()
|
||||
# Notice that num_decodes != num_decode_tokens in SpecDecoding Scenario
|
||||
actual_seq_lengths_q = query_start_loc_cpu[1:num_decodes +
|
||||
1].tolist()
|
||||
|
||||
@@ -16,6 +16,7 @@ from vllm.v1.attention.backends.utils import AttentionCGSupport
|
||||
|
||||
from vllm_ascend import envs
|
||||
from vllm_ascend.ascend_config import get_ascend_config
|
||||
from vllm_ascend.ascend_forward_context import get_cos_and_sin
|
||||
from vllm_ascend.attention.attention_v1 import AscendAttentionState
|
||||
from vllm_ascend.attention.mla_v1 import MAX_O_PROJ_PREFETCH_SIZE
|
||||
from vllm_ascend.attention.utils import (AscendCommonAttentionMetadata,
|
||||
@@ -186,8 +187,7 @@ class AscendSFAMetadataBuilder:
|
||||
cum_query_lens = common_attn_metadata.query_start_loc[1:num_reqs + 1]
|
||||
seq_lens = common_attn_metadata.seq_lens[:num_reqs]
|
||||
|
||||
cos = common_attn_metadata.cos
|
||||
sin = common_attn_metadata.sin
|
||||
cos, sin = get_cos_and_sin()
|
||||
|
||||
assert self.cos_cache is not None and self.sin_cache is not None
|
||||
new_cos = self.cos_cache[input_positions][:, None, None]
|
||||
|
||||
@@ -100,10 +100,6 @@ class AscendCommonAttentionMetadata:
|
||||
# padding tokens. It is used to handle some padding operations.
|
||||
num_input_tokens: int = 0
|
||||
|
||||
# NOTE: This is a temporary solution for rotary embedding in MLA
|
||||
cos: torch.Tensor = None
|
||||
sin: torch.Tensor = None
|
||||
|
||||
prefill_context_parallel_metadata: Optional[
|
||||
AscendPrefillContextParallelMetadata] = None
|
||||
|
||||
|
||||
Reference in New Issue
Block a user