[Feature] model_runner refactor (#4764)

### What this PR does / why we need it? refactor npu_modelrunner， we should be close to gpu_modelrunner ### Does this PR introduce _any_ user-facing change? NO - vLLM version: v0.12.0 - vLLM main: ad32e3e19c --------- Signed-off-by: zhenwenqi2024 <zhenwenqi_2022@qq.com> Signed-off-by: zhenwenqi2024 <155598497+zhenwenqi2024@users.noreply.github.com>
2025-12-12 17:27:09 +08:00
parent 5b12c068f9
commit f708d919f8
10 changed files with 676 additions and 1815 deletions
--- a/vllm_ascend/attention/sfa_v1.py
+++ b/vllm_ascend/attention/sfa_v1.py
@@ -16,6 +16,7 @@ from vllm.v1.attention.backends.utils import AttentionCGSupport

 from vllm_ascend import envs
 from vllm_ascend.ascend_config import get_ascend_config
+from vllm_ascend.ascend_forward_context import get_cos_and_sin
 from vllm_ascend.attention.attention_v1 import AscendAttentionState
 from vllm_ascend.attention.mla_v1 import MAX_O_PROJ_PREFETCH_SIZE
 from vllm_ascend.attention.utils import (AscendCommonAttentionMetadata,
@@ -186,8 +187,7 @@ class AscendSFAMetadataBuilder:
        cum_query_lens = common_attn_metadata.query_start_loc[1:num_reqs + 1]
        seq_lens = common_attn_metadata.seq_lens[:num_reqs]

-        cos = common_attn_metadata.cos
-        sin = common_attn_metadata.sin
+        cos, sin = get_cos_and_sin()

        assert self.cos_cache is not None and self.sin_cache is not None
        new_cos = self.cos_cache[input_positions][:, None, None]