[CI] Upgrade vllm to 0.9.1 (#1165)
1. Upgrade vllm to 0.9.1; 0.9.0 is no longer supported on the main branch. Keep the docs pinned to 0.9.0 until the first 0.9.1 release is published.
2. Disable the V0 test for PRs.
3. Move the actionlint check to the lint job.

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
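The diffs below delete the `vllm_version_is("0.9.0")` branches this plugin used to straddle both releases. For context, here is a minimal sketch of what such a version gate can look like; the real helper lives in vllm_ascend.utils, and this stdlib-only reimplementation is an assumption, not its actual code:

from importlib.metadata import version


def vllm_version_is(target: str) -> bool:
    # True when the installed vllm release exactly matches `target`.
    return version("vllm") == target

With 0.9.0 dropped, every call site collapses to its else branch, which is exactly what the hunks below do.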
@@ -14,8 +14,6 @@ from vllm.config import (CompilationConfig, CompilationLevel, VllmConfig,
                          set_current_vllm_config)
 from vllm.utils import direct_register_custom_op
 
-from vllm_ascend.utils import vllm_version_is
-
 global_counter = 0
 
 # create a library to hold the custom op
@@ -93,28 +91,14 @@ def test_simple_piecewise_compile():
 
     model = SillyModel(vllm_config=vllm_config, prefix="")
 
     inputs = torch.randn(100).npu()
 
-    if vllm_version_is("0.9.0"):
-        kwargs = {
-            "num_graphs_seen": 1,  # one graph for the model
-            "num_piecewise_graphs_seen": 5,  # 2 * num_layers + 1
-            "num_piecewise_capturable_graphs_seen": 3,  # 1 + num_layers
-            "num_backend_compilations":
-            3,  # num_piecewise_capturable_graphs_seen
-            "num_cudagraph_caputured":
-            6  # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
-        }
-    else:
-        kwargs = {
-            "num_graphs_seen": 1,  # one graph for the model
-            "num_piecewise_graphs_seen": 5,  # 2 * num_layers + 1
-            "num_piecewise_capturable_graphs_seen": 3,  # 1 + num_layers
-            "num_backend_compilations":
-            3,  # num_piecewise_capturable_graphs_seen
-            "num_cudagraph_captured":
-            6  # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
-        }
+    kwargs = {
+        "num_graphs_seen": 1,  # one graph for the model
+        "num_piecewise_graphs_seen": 5,  # 2 * num_layers + 1
+        "num_piecewise_capturable_graphs_seen": 3,  # 1 + num_layers
+        "num_backend_compilations": 3,  # num_piecewise_capturable_graphs_seen
+        "num_cudagraph_captured":
+        6  # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
+    }
 
     with compilation_counter.expect(kwargs):
         model(inputs)
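The only difference between the two deleted branches above is one counter key: the 0.9.0 branch spells it num_cudagraph_caputured, which appears to match a misspelled field name in that release, while 0.9.1 uses num_cudagraph_captured. For readers unfamiliar with the test, compilation_counter.expect(...) asserts that running the model bumps each counter by the given amount. A minimal sketch of that expect-style pattern, assuming it snapshots counters and checks deltas (illustrative names only, not vllm's implementation):

import copy
from contextlib import contextmanager
from dataclasses import dataclass, field


@dataclass
class Counters:
    values: dict = field(default_factory=dict)

    @contextmanager
    def expect(self, expected: dict):
        # Snapshot, run the body, then assert each counter moved by the
        # expected delta.
        before = copy.deepcopy(self.values)
        yield
        for key, delta in expected.items():
            actual = self.values.get(key, 0) - before.get(key, 0)
            assert actual == delta, f"{key}: expected {delta}, got {actual}"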
@@ -31,7 +31,6 @@ from vllm.v1.request import Request, RequestStatus
 from vllm.v1.structured_output import StructuredOutputManager
 
 from vllm_ascend.core.scheduler import AscendScheduler
-from vllm_ascend.utils import vllm_version_is
 
 EOS_TOKEN_ID = 50256
 
@@ -87,27 +86,15 @@ def create_scheduler(
     vllm_config = VllmConfig(scheduler_config=scheduler_config,
                              model_config=model_config,
                              cache_config=cache_config)
 
-    if vllm_version_is("0.9.0"):
-        kv_cache_config = KVCacheConfig(
-            num_blocks=10000,  # A large number of blocks to hold all requests
-            tensors={},
-            kv_cache_groups=[
-                KVCacheGroupSpec(['layer'],
-                                 FullAttentionSpec(16, 1, 1, torch.float32,
-                                                   False))
-            ],
-        )
-    else:
-        kv_cache_config = KVCacheConfig(
-            num_blocks=10000,  # A large number of blocks to hold all requests
-            kv_cache_tensors=[KVCacheTensor(size=1024, shared_by=[1])],
-            kv_cache_groups=[
-                KVCacheGroupSpec(['layer'],
-                                 FullAttentionSpec(16, 1, 1, torch.float32,
-                                                   False, None))
-            ],
-        )
+    kv_cache_config = KVCacheConfig(
+        num_blocks=10000,  # A large number of blocks to hold all requests
+        kv_cache_tensors=[KVCacheTensor(size=1024, shared_by=[1])],
+        kv_cache_groups=[
+            KVCacheGroupSpec(['layer'],
+                             FullAttentionSpec(16, 1, 1, torch.float32, False,
+                                               None))
+        ],
+    )
     cache_config.num_gpu_blocks = 10000
     return AscendScheduler(
         vllm_config,
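This hunk shows the 0.9.0 -> 0.9.1 API change for KV-cache setup: the tensors={} field becomes a kv_cache_tensors list of KVCacheTensor entries, and FullAttentionSpec takes one extra trailing argument. A self-contained sketch of the construction this commit keeps; the import path and the meaning of the positional arguments are assumptions read off the diff, not verified against 0.9.1:

import torch
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
                                        KVCacheGroupSpec, KVCacheTensor)


def make_test_kv_cache_config() -> KVCacheConfig:
    # Hypothetical helper mirroring the 0.9.1-style construction above.
    return KVCacheConfig(
        num_blocks=10000,  # enough blocks to hold all test requests
        kv_cache_tensors=[KVCacheTensor(size=1024, shared_by=[1])],
        kv_cache_groups=[
            KVCacheGroupSpec(['layer'],
                             FullAttentionSpec(16, 1, 1, torch.float32, False,
                                               None))
        ],
    )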
@@ -135,27 +122,15 @@ def create_requests(num_requests: int,
         else:
             mm_position = None
             mm_inputs = None
-        if vllm_version_is("0.9.0"):
-            request = Request(
-                request_id=f"{i}",
-                prompt_token_ids=[i] * num_tokens,
-                sampling_params=sampling_params,
-                multi_modal_inputs=mm_inputs,
-                multi_modal_placeholders=mm_position,
-                multi_modal_hashes=None,
-                arrival_time=0,
-                eos_token_id=EOS_TOKEN_ID,
-            )
-        else:
-            request = Request(
-                request_id=f"{i}",
-                prompt_token_ids=[i] * num_tokens,
-                sampling_params=sampling_params,
-                multi_modal_inputs=mm_inputs,
-                multi_modal_placeholders=mm_position,
-                multi_modal_hashes=None,
-                eos_token_id=EOS_TOKEN_ID,
-            )
+        request = Request(
+            request_id=f"{i}",
+            prompt_token_ids=[i] * num_tokens,
+            sampling_params=sampling_params,
+            multi_modal_inputs=mm_inputs,
+            multi_modal_placeholders=mm_position,
+            multi_modal_hashes=None,
+            eos_token_id=EOS_TOKEN_ID,
+        )
         requests.append(request)
     return requests
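Here the only 0.9.0-specific argument was arrival_time=0; the 0.9.1 call simply omits it, so dropping the version gate loses nothing. A hedged sketch of the surviving call as a standalone helper (make_request is a hypothetical name; the keyword arguments are taken verbatim from the diff):

from vllm.sampling_params import SamplingParams
from vllm.v1.request import Request

EOS_TOKEN_ID = 50256


def make_request(i: int, num_tokens: int,
                 sampling_params: SamplingParams) -> Request:
    # Mirrors the Request construction kept by this commit (no multimodal
    # inputs in this simple case).
    return Request(
        request_id=f"{i}",
        prompt_token_ids=[i] * num_tokens,
        sampling_params=sampling_params,
        multi_modal_inputs=None,
        multi_modal_placeholders=None,
        multi_modal_hashes=None,
        eos_token_id=EOS_TOKEN_ID,
    )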