[Main2Main] Upgrade vLLM to 0226 (#6813)

### What this PR does / why we need it?

Breaking changes pulled in from upstream vLLM (a compatibility-shim sketch follows this list):
1. https://github.com/vllm-project/vllm/pull/33452
2. https://github.com/vllm-project/vllm/pull/33451
3. https://github.com/vllm-project/vllm/pull/32567
4. https://github.com/vllm-project/vllm/pull/32344
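
One of these upstream changes relocates `Attention`/`MLAAttention` (compare the two import paths in the first hunk below). A downstream plugin that must straddle both releases without a version helper could fall back on `ImportError` instead; a minimal sketch, assuming each module path exists in its respective vLLM version:

```python
# Sketch of an ImportError-based shim, an alternative to the
# vllm_version_is() check this PR removes. The two paths below are
# the ones shown in the diff; nothing else is assumed about vLLM.
try:
    # New location on current vLLM main.
    from vllm.model_executor.layers.attention import Attention, MLAAttention
except ImportError:
    # Old location in vLLM v0.15.0 and earlier.
    from vllm.attention.layer import Attention, MLAAttention  # type: ignore
```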

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.15.0
- vLLM main: 83b47f67b1

---------

Signed-off-by: MrZ20 <2609716663@qq.com>
Signed-off-by: gcanlin <canlinguosdu@gmail.com>
Co-authored-by: MrZ20 <2609716663@qq.com>
Authored by Canlin Guo on 2026-02-27 16:05:21 +08:00; committed by GitHub.
Commit e4458b2d2b (parent 80316c5824): 40 changed files with 117 additions and 184 deletions.


@@ -138,12 +138,8 @@ if TYPE_CHECKING:
 else:
     xgr = LazyLoader("xgr", globals(), "xgrammar")
-from vllm_ascend.utils import vllm_version_is
-if vllm_version_is("v0.15.0"):
-    from vllm.attention.layer import Attention, MLAAttention  # type: ignore
-else:
-    from vllm.model_executor.layers.attention import Attention, MLAAttention
+from vllm.model_executor.layers.attention import Attention, MLAAttention
 # if true, allow tensor initialization and casting with internal format (e.g., NZ)
 torch.npu.config.allow_internal_format = True
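
As an aside, the `LazyLoader("xgr", globals(), "xgrammar")` context line above defers the `xgrammar` import until first attribute access. A minimal sketch of that lazy-module pattern (an illustrative reimplementation, not vLLM's actual `LazyLoader` class):

```python
import importlib
import types


class LazyModule(types.ModuleType):
    """Proxy that imports the real module on first attribute access."""

    def __init__(self, local_name: str, parent_globals: dict, name: str):
        super().__init__(name)
        self._local_name = local_name
        self._parent_globals = parent_globals

    def __getattr__(self, item: str):
        # Import for real, then swap the proxy out of the caller's
        # namespace so later lookups hit the module directly.
        module = importlib.import_module(self.__name__)
        self._parent_globals[self._local_name] = module
        self.__dict__.update(module.__dict__)
        return getattr(module, item)


# Usage mirroring the diff: nothing is imported until xgr.<attr> is touched.
xgr = LazyModule("xgr", globals(), "xgrammar")
```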


@@ -531,6 +531,9 @@ class NPUWorker(WorkerBase):
     def pin_lora(self, lora_id: int) -> bool:
         return self.model_runner.pin_lora(lora_id)
 
+    def reset_encoder_cache(self) -> None:
+        self.model_runner.reset_encoder_cache()
+
     def execute_dummy_batch(self) -> None:
         self.model_runner._dummy_run(num_tokens=self.model_runner.decode_token_per_req, uniform_decode=True)
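
This hunk wires a new engine hook through the worker: `NPUWorker.reset_encoder_cache` is a thin passthrough to the runner, in the same style as the existing `pin_lora`. A minimal sketch of that delegation pattern (only the method names come from the diff; the runner's cache field and clearing logic are assumptions for illustration):

```python
class RunnerSketch:
    """Stand-in for the model runner; encoder_cache is hypothetical."""

    def __init__(self) -> None:
        self.encoder_cache: dict[str, object] = {}

    def reset_encoder_cache(self) -> None:
        # Drop any cached encoder outputs so the next multimodal
        # request recomputes them from scratch.
        self.encoder_cache.clear()


class WorkerSketch:
    """Stand-in for NPUWorker: owns the runner and forwards calls."""

    def __init__(self) -> None:
        self.model_runner = RunnerSketch()

    def reset_encoder_cache(self) -> None:
        # Same one-line delegation the hunk above adds.
        self.model_runner.reset_encoder_cache()
```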