Upgrade vLLM to v0.10.0 (#1927)

### What this PR does / why we need it?
- Upgrade to v0.10.0
- Drop v0.9.2 version compatibility
- Add the patch `vllm_ascend/patch/worker/patch_common/patch_sampler_gather_logprobs.py` as a workaround for f3a683b7c9 in v0.10.0, and also add the e2e test `test_models_prompt_logprobs`
- Pin transformers<4.54.0 as a workaround for https://github.com/vllm-project/vllm-ascend/issues/2034

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
- Test locally: `VLLM_USE_MODELSCOPE=true pytest -sv tests/e2e/singlecard/test_offline_inference.py::test_models_prompt_logprobs`
- CI passed

- vLLM version: v0.9.2
- vLLM main: 7728dd77bb

---------

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-07-26 15:43:29 +08:00
parent 2f50304c19
commit 17a430f7b8
29 changed files with 198 additions and 251 deletions
--- a/vllm_ascend/core/scheduler.py
+++ b/vllm_ascend/core/scheduler.py
@@ -32,8 +32,6 @@ from vllm.v1.outputs import ModelRunnerOutput
 from vllm.v1.request import Request, RequestStatus
 from vllm.v1.structured_output import StructuredOutputManager

-from vllm_ascend.utils import vllm_version_is
-

 class AscendScheduler(Scheduler):
    """This Scheduler extends vllm's original v1 scheduler
@@ -283,23 +281,12 @@ class AscendScheduler(Scheduler):
                    # allow the lower-priority requests to be scheduled.
                    req_index += 1
                    continue
-                if vllm_version_is("0.9.2"):
-                    num_draft_tokens = max(
-                        num_new_tokens + request.num_computed_tokens -
-                        request.num_tokens, 0)

                while True:
-                    if vllm_version_is("0.9.2"):
-                        new_blocks = self.kv_cache_manager.allocate_slots(
-                            request,
-                            num_new_tokens,
-                            num_draft_tokens=num_draft_tokens,
-                            num_lookahead_tokens=self.num_lookahead_tokens)
-                    else:
-                        new_blocks = self.kv_cache_manager.allocate_slots(
-                            request,
-                            num_new_tokens,
-                            num_lookahead_tokens=self.num_lookahead_tokens)
+                    new_blocks = self.kv_cache_manager.allocate_slots(
+                        request,
+                        num_new_tokens,
+                        num_lookahead_tokens=self.num_lookahead_tokens)
                    if new_blocks is None:
                        # The request cannot be scheduled.
                        # Preempt the lowest-priority request.