upgrade to vllm 0.11.2 (#4400)
Bump vLLM version to v0.11.2 What's broken and changed by vLLM: 1. structured_output is broken by https://github.com/vllm-project/vllm/pull/26866 2. get_mrope_input_positions is broken by https://github.com/vllm-project/vllm/pull/28399 3. graph mode is broken by https://github.com/vllm-project/vllm/pull/25110 We'll upgrade torch to 2.8 to fix the problem later 4. embedding is broken by https://github.com/vllm-project/vllm/pull/27583 5. `get_attn_backend_cls` and the attention backend are broken by https://github.com/vllm-project/vllm/pull/28534 6. spec decode is broken by https://github.com/vllm-project/vllm/pull/28771 7. sp feature is broken by https://github.com/vllm-project/vllm/pull/27126 8. mtp is broken by https://github.com/vllm-project/vllm/pull/27922 9. lora is broken by https://github.com/vllm-project/vllm/pull/21068 10. execute_model is broken by https://github.com/vllm-project/vllm/pull/26866 11. `VLLM_DISABLE_SHARED_EXPERTS_STREAM` env is broken by https://github.com/vllm-project/vllm/pull/28159 12. kv cache is broken by https://github.com/vllm-project/vllm/pull/27753 13. dp is broken by https://github.com/vllm-project/vllm/pull/25110 What's broken and changed by ourselves: 1. qwen vl is broken by https://github.com/vllm-project/vllm/pull/28455 We'll remove model files in the future to avoid this kind of error 2. Engine core is broken by https://github.com/vllm-project/vllm/pull/23691 We'll remove the patch file in the future. 3. Ascend scheduler is broken by https://github.com/vllm-project/vllm/pull/28733 We'll remove the Ascend scheduler later. 4. qwen3-next is broken by https://github.com/vllm-project/vllm/pull/28083 We'll remove model files in the future to avoid this kind of error 5. qwen vl is broken by https://github.com/vllm-project/vllm/pull/27764. We'll remove model files in the future Known issues: 1. ray doesn't work 2. the accuracy of qwen3-next is not correct 3. qwen3-vl is broken 4. 
prefix cache + ascend scheduler + deepseek v2 lite is broken. Co-authored-by: MengqingCao <cmq0113@163.com> Co-authored-by: hfadzxy <starmoon_zhang@163.com> Co-authored-by: leo-pony <nengjunma@outlook.com> Co-authored-by: 22dimensions <waitingwind@foxmail.com> Co-authored-by: shen-shanshan <467638484@qq.com> - vLLM version: v0.11.2 --------- Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com> Signed-off-by: MengqingCao <cmq0113@163.com> Signed-off-by: hfadzxy <starmoon_zhang@163.com> Signed-off-by: leo-pony <nengjunma@outlook.com> Co-authored-by: MengqingCao <cmq0113@163.com> Co-authored-by: hfadzxy <starmoon_zhang@163.com> Co-authored-by: leo-pony <nengjunma@outlook.com>
This commit is contained in:
@@ -14,7 +14,7 @@ from vllm_ascend.attention.utils import AscendCommonAttentionMetadata
|
||||
class TestAscendAttentionBackend(TestBase):
|
||||
|
||||
def test_get_name(self):
|
||||
self.assertEqual(AscendAttentionBackend.get_name(), "ASCEND")
|
||||
self.assertEqual(AscendAttentionBackend.get_name(), "CUSTOM")
|
||||
|
||||
def test_get_impl_cls(self):
|
||||
self.assertEqual(AscendAttentionBackend.get_impl_cls(),
|
||||
|
||||
@@ -107,8 +107,7 @@ class TestACLGraphWrapper(TestBase):
|
||||
|
||||
wrapper = ACLGraphWrapper(runnable=self.mock_runnable,
|
||||
vllm_config=self.mock_vllm_config,
|
||||
runtime_mode=CUDAGraphMode.FULL,
|
||||
graph_pool=self.mock_graph_pool)
|
||||
runtime_mode=CUDAGraphMode.FULL)
|
||||
|
||||
self.assertEqual(wrapper.runnable, self.mock_runnable)
|
||||
self.assertEqual(wrapper.vllm_config, self.mock_vllm_config)
|
||||
@@ -130,7 +129,6 @@ class TestACLGraphWrapper(TestBase):
|
||||
runnable=self.mock_runnable,
|
||||
vllm_config=self.mock_vllm_config,
|
||||
runtime_mode=CUDAGraphMode.FULL,
|
||||
graph_pool=self.mock_graph_pool,
|
||||
cudagraph_options=self.mock_cudagraph_options)
|
||||
|
||||
self.assertEqual(wrapper.runnable, self.mock_runnable)
|
||||
@@ -152,8 +150,7 @@ class TestACLGraphWrapper(TestBase):
|
||||
with self.assertRaises(AssertionError):
|
||||
ACLGraphWrapper(runnable=self.mock_runnable,
|
||||
vllm_config=self.mock_vllm_config,
|
||||
runtime_mode=CUDAGraphMode.NONE,
|
||||
graph_pool=self.mock_graph_pool)
|
||||
runtime_mode=CUDAGraphMode.NONE)
|
||||
|
||||
@patch('vllm_ascend.compilation.acl_graph.get_forward_context')
|
||||
@patch('vllm_ascend.compilation.acl_graph.current_platform')
|
||||
@@ -171,7 +168,6 @@ class TestACLGraphWrapper(TestBase):
|
||||
runnable=self.mock_runnable,
|
||||
vllm_config=self.mock_vllm_config,
|
||||
runtime_mode=CUDAGraphMode.FULL,
|
||||
graph_pool=self.mock_graph_pool,
|
||||
cudagraph_options=self.mock_cudagraph_options)
|
||||
|
||||
result = wrapper("arg1", "arg2")
|
||||
@@ -196,7 +192,6 @@ class TestACLGraphWrapper(TestBase):
|
||||
runnable=self.mock_runnable,
|
||||
vllm_config=self.mock_vllm_config,
|
||||
runtime_mode=CUDAGraphMode.FULL,
|
||||
graph_pool=self.mock_graph_pool,
|
||||
cudagraph_options=self.mock_cudagraph_options)
|
||||
|
||||
result = wrapper("arg1", "arg2")
|
||||
@@ -247,7 +242,6 @@ class TestACLGraphWrapper(TestBase):
|
||||
runnable=self.mock_runnable,
|
||||
vllm_config=self.mock_vllm_config,
|
||||
runtime_mode=CUDAGraphMode.FULL,
|
||||
graph_pool=self.mock_graph_pool,
|
||||
cudagraph_options=self.mock_cudagraph_options)
|
||||
|
||||
# Create a real torch tensor for the test, not a mock
|
||||
@@ -319,7 +313,6 @@ class TestACLGraphWrapper(TestBase):
|
||||
runnable=self.mock_runnable,
|
||||
vllm_config=self.mock_vllm_config,
|
||||
runtime_mode=CUDAGraphMode.FULL,
|
||||
graph_pool=self.mock_graph_pool,
|
||||
cudagraph_options=self.mock_cudagraph_options)
|
||||
|
||||
# Create a real torch tensor for the test, not a mock
|
||||
@@ -392,7 +385,6 @@ class TestACLGraphWrapper(TestBase):
|
||||
runnable=self.mock_runnable,
|
||||
vllm_config=self.mock_vllm_config,
|
||||
runtime_mode=CUDAGraphMode.FULL,
|
||||
graph_pool=self.mock_graph_pool,
|
||||
cudagraph_options=self.mock_cudagraph_options)
|
||||
|
||||
# First call to capture the graph
|
||||
@@ -447,7 +439,6 @@ class TestACLGraphWrapper(TestBase):
|
||||
runnable=self.mock_runnable,
|
||||
vllm_config=self.mock_vllm_config,
|
||||
runtime_mode=CUDAGraphMode.FULL,
|
||||
graph_pool=self.mock_graph_pool,
|
||||
cudagraph_options=self.mock_cudagraph_options)
|
||||
|
||||
# First call to capture the graph
|
||||
@@ -518,7 +509,6 @@ class TestACLGraphWrapper(TestBase):
|
||||
runnable=self.mock_runnable,
|
||||
vllm_config=self.mock_vllm_config,
|
||||
runtime_mode=CUDAGraphMode.FULL,
|
||||
graph_pool=self.mock_graph_pool,
|
||||
cudagraph_options=self.mock_cudagraph_options)
|
||||
|
||||
# Create a real torch tensor for the test, not a mock
|
||||
@@ -588,7 +578,6 @@ class TestACLGraphWrapper(TestBase):
|
||||
runnable=self.mock_runnable,
|
||||
vllm_config=self.mock_vllm_config,
|
||||
runtime_mode=CUDAGraphMode.FULL,
|
||||
graph_pool=self.mock_graph_pool,
|
||||
cudagraph_options=self.mock_cudagraph_options)
|
||||
|
||||
# Create a real torch tensor for the test, not a mock
|
||||
@@ -659,7 +648,6 @@ class TestACLGraphWrapper(TestBase):
|
||||
runnable=self.mock_runnable,
|
||||
vllm_config=self.mock_vllm_config,
|
||||
runtime_mode=CUDAGraphMode.FULL,
|
||||
graph_pool=self.mock_graph_pool,
|
||||
cudagraph_options=self.mock_cudagraph_options)
|
||||
|
||||
# Create a real torch tensor for the test, not a mock
|
||||
@@ -680,7 +668,6 @@ class TestACLGraphWrapper(TestBase):
|
||||
runnable=mock_runnable,
|
||||
vllm_config=self.mock_vllm_config,
|
||||
runtime_mode=CUDAGraphMode.FULL,
|
||||
graph_pool=self.mock_graph_pool,
|
||||
cudagraph_options=self.mock_cudagraph_options)
|
||||
|
||||
# Should be able to access attributes of the runnable
|
||||
@@ -699,7 +686,6 @@ class TestACLGraphWrapper(TestBase):
|
||||
runnable=mock_runnable,
|
||||
vllm_config=self.mock_vllm_config,
|
||||
runtime_mode=CUDAGraphMode.FULL,
|
||||
graph_pool=self.mock_graph_pool,
|
||||
cudagraph_options=self.mock_cudagraph_options)
|
||||
|
||||
# Should raise AttributeError for non-existent attributes
|
||||
@@ -715,7 +701,6 @@ class TestACLGraphWrapper(TestBase):
|
||||
runnable=self.mock_runnable,
|
||||
vllm_config=self.mock_vllm_config,
|
||||
runtime_mode=CUDAGraphMode.FULL,
|
||||
graph_pool=self.mock_graph_pool,
|
||||
cudagraph_options=self.mock_cudagraph_options)
|
||||
|
||||
unwrapped = wrapper.unwrap()
|
||||
|
||||
@@ -48,7 +48,7 @@ class TestAscendSchedulerConfig(TestBase):
|
||||
enable_chunked_prefill=False,
|
||||
policy="fcfs",
|
||||
scheduler_cls="vllm_ascend.core.scheduler.AscendScheduler",
|
||||
max_num_batched_tokens=2048,
|
||||
max_num_batched_tokens=8192,
|
||||
max_model_len=2048,
|
||||
max_long_partial_prefills=1,
|
||||
long_prefill_token_threshold=512,
|
||||
@@ -58,8 +58,8 @@ class TestAscendSchedulerConfig(TestBase):
|
||||
self.assertEqual(ascend_config.policy, "fcfs")
|
||||
self.assertEqual(ascend_config.scheduler_cls,
|
||||
"vllm_ascend.core.scheduler.AscendScheduler")
|
||||
self.assertEqual(ascend_config.max_num_batched_tokens, 2048)
|
||||
self.assertEqual(ascend_config.encoder_cache_size, 2048)
|
||||
self.assertEqual(ascend_config.max_num_batched_tokens, 8192)
|
||||
self.assertEqual(ascend_config.encoder_cache_size, 8192)
|
||||
self.assertEqual(ascend_config.max_long_partial_prefills, 1)
|
||||
self.assertEqual(ascend_config.long_prefill_token_threshold, 512)
|
||||
|
||||
@@ -69,7 +69,7 @@ class TestAscendSchedulerConfig(TestBase):
|
||||
self.basic_scheduler_config,
|
||||
AscendSchedulerConfig(
|
||||
policy="custom_policy",
|
||||
max_num_batched_tokens=2048,
|
||||
max_num_batched_tokens=8192,
|
||||
max_model_len=2048,
|
||||
),
|
||||
)
|
||||
@@ -86,7 +86,8 @@ class TestAscendSchedulerConfig(TestBase):
|
||||
|
||||
def test_valid_config_with_multimodal(self):
|
||||
config = AscendSchedulerConfig.initialize_from_config(
|
||||
SchedulerConfig(is_multimodal_model=True), {})
|
||||
SchedulerConfig(is_multimodal_model=True,
|
||||
max_num_batched_tokens=8192), {})
|
||||
self.assertTrue(config.is_multimodal_model)
|
||||
|
||||
def test_valid_config_with_chunked_prefill(self):
|
||||
@@ -94,12 +95,12 @@ class TestAscendSchedulerConfig(TestBase):
|
||||
self.basic_scheduler_config,
|
||||
AscendSchedulerConfig(
|
||||
enable_chunked_prefill=True,
|
||||
max_num_batched_tokens=2048,
|
||||
max_model_len=4096,
|
||||
max_num_batched_tokens=8192,
|
||||
max_model_len=8192,
|
||||
),
|
||||
)
|
||||
self.assertEqual(ascend_config.max_num_batched_tokens, 2048)
|
||||
self.assertEqual(ascend_config.max_model_len, 4096)
|
||||
self.assertEqual(ascend_config.max_num_batched_tokens, 8192)
|
||||
self.assertEqual(ascend_config.max_model_len, 8192)
|
||||
self.assertTrue(ascend_config.enable_chunked_prefill)
|
||||
|
||||
def test_invalid_config_without_chunked_prefill(self):
|
||||
@@ -109,7 +110,7 @@ class TestAscendSchedulerConfig(TestBase):
|
||||
AscendSchedulerConfig(
|
||||
enable_chunked_prefill=False,
|
||||
max_num_batched_tokens=2048,
|
||||
max_model_len=4096,
|
||||
max_model_len=8192,
|
||||
),
|
||||
)
|
||||
self.assertIn(
|
||||
@@ -117,7 +118,7 @@ class TestAscendSchedulerConfig(TestBase):
|
||||
str(context.exception),
|
||||
)
|
||||
self.assertIn("max_num_batched_tokens (2048)", str(context.exception))
|
||||
self.assertIn("max_model_len (4096)", str(context.exception))
|
||||
self.assertIn("max_model_len (8192)", str(context.exception))
|
||||
|
||||
def test_initialize_from_config_with_pd_transfer(self):
|
||||
ascend_config = AscendSchedulerConfig.initialize_from_config(
|
||||
@@ -125,7 +126,7 @@ class TestAscendSchedulerConfig(TestBase):
|
||||
AscendSchedulerConfig(
|
||||
enable_pd_transfer=True,
|
||||
decode_max_num_seqs=48,
|
||||
max_num_batched_tokens=4096,
|
||||
max_num_batched_tokens=8192,
|
||||
max_model_len=4096,
|
||||
),
|
||||
)
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from vllm.config import (CacheConfig, KVTransferConfig, ModelConfig,
|
||||
SchedulerConfig, SpeculativeConfig, VllmConfig)
|
||||
@@ -80,7 +81,10 @@ def make_output(scheduler):
|
||||
req.request_id: i
|
||||
for i, req in enumerate(scheduler.running)
|
||||
}
|
||||
sampled_token_ids = [[1000]] * len(scheduler.running)
|
||||
sampled_token_ids = [
|
||||
np.array([1000], dtype=np.int64) for _ in scheduler.running
|
||||
]
|
||||
|
||||
logprobs = None
|
||||
|
||||
modelrunner_output = ModelRunnerOutput(
|
||||
@@ -361,16 +365,15 @@ class TestAscendScheduler(TestBase):
|
||||
},
|
||||
num_common_prefix_blocks=0,
|
||||
finished_req_ids=set(),
|
||||
free_encoder_mm_hashes=[],
|
||||
structured_output_request_ids={},
|
||||
grammar_bitmask=None)
|
||||
free_encoder_mm_hashes=[])
|
||||
model_output = ModelRunnerOutput(
|
||||
req_ids=[req.request_id for req in requests],
|
||||
req_id_to_index={
|
||||
req.request_id: i
|
||||
for i, req in enumerate(requests)
|
||||
},
|
||||
sampled_token_ids=[[EOS_TOKEN_ID], [10, 11]
|
||||
sampled_token_ids=[np.array([EOS_TOKEN_ID]),
|
||||
np.array([10, 11])
|
||||
], # First request hits EOS, second continues
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
@@ -414,17 +417,16 @@ class TestAscendScheduler(TestBase):
|
||||
},
|
||||
num_common_prefix_blocks=0,
|
||||
finished_req_ids=set(),
|
||||
free_encoder_mm_hashes=[],
|
||||
structured_output_request_ids={},
|
||||
grammar_bitmask=None)
|
||||
free_encoder_mm_hashes=[])
|
||||
model_output = ModelRunnerOutput(
|
||||
req_ids=[req.request_id for req in requests],
|
||||
req_id_to_index={
|
||||
req.request_id: i
|
||||
for i, req in enumerate(requests)
|
||||
},
|
||||
sampled_token_ids=[[10, 42, 12],
|
||||
[13, 14]], # First request hits stop token
|
||||
sampled_token_ids=[np.array([10, 42, 12]),
|
||||
np.array([13, 14])
|
||||
], # First request hits stop token
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[])
|
||||
@@ -466,17 +468,16 @@ class TestAscendScheduler(TestBase):
|
||||
},
|
||||
num_common_prefix_blocks=0,
|
||||
finished_req_ids=set(),
|
||||
free_encoder_mm_hashes=[],
|
||||
structured_output_request_ids={},
|
||||
grammar_bitmask=None)
|
||||
free_encoder_mm_hashes=[])
|
||||
model_output = ModelRunnerOutput(
|
||||
req_ids=[req.request_id for req in requests],
|
||||
req_id_to_index={
|
||||
req.request_id: i
|
||||
for i, req in enumerate(requests)
|
||||
},
|
||||
sampled_token_ids=[[10, 11, 12],
|
||||
[13]], # First request exceeds max_tokens
|
||||
sampled_token_ids=[np.array([10, 11, 12]),
|
||||
np.array([13])
|
||||
], # First request exceeds max_tokens
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[])
|
||||
@@ -511,13 +512,11 @@ class TestAscendScheduler(TestBase):
|
||||
},
|
||||
num_common_prefix_blocks=0,
|
||||
finished_req_ids=set(),
|
||||
free_encoder_mm_hashes=[],
|
||||
structured_output_request_ids={},
|
||||
grammar_bitmask=None)
|
||||
free_encoder_mm_hashes=[])
|
||||
model_output = ModelRunnerOutput(
|
||||
req_ids=[requests[0].request_id],
|
||||
req_id_to_index={requests[0].request_id: 0},
|
||||
sampled_token_ids=[[EOS_TOKEN_ID, 10, 11]],
|
||||
sampled_token_ids=[np.array([EOS_TOKEN_ID, 10, 11])],
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[])
|
||||
@@ -574,7 +573,7 @@ class TestAscendScheduler(TestBase):
|
||||
model_runner_output = ModelRunnerOutput(
|
||||
req_ids=[requests[0].request_id],
|
||||
req_id_to_index={requests[0].request_id: 0},
|
||||
sampled_token_ids=[[0]],
|
||||
sampled_token_ids=[np.array([0], dtype=np.int64)],
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[])
|
||||
@@ -590,7 +589,7 @@ class TestAscendScheduler(TestBase):
|
||||
model_runner_output = ModelRunnerOutput(
|
||||
req_ids=[requests[1].request_id],
|
||||
req_id_to_index={requests[1].request_id: 0},
|
||||
sampled_token_ids=[[0]],
|
||||
sampled_token_ids=[np.array([0], dtype=np.int64)],
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[])
|
||||
@@ -608,10 +607,12 @@ class TestAscendScheduler(TestBase):
|
||||
spec_tokens_list: List[List[List[int]]] = [[[1, 2, 3]], [[1, 2, 3]],
|
||||
[[1, 2], [3]], [[1]], [[]],
|
||||
[[1, 2, 3], [4, 5, 6]]]
|
||||
output_tokens_list: List[List[List[int]]] = [[[1, 2, 3, 4]], [[1, 5]],
|
||||
[[1, 2, 5], [3, 4]],
|
||||
[[1, 2]], [[5]],
|
||||
[[1, 2, 7], [4, 8]]]
|
||||
output_tokens_list: List[List[List[int]]] = [
|
||||
[np.array([1, 2, 3, 4])], [np.array([1, 5])],
|
||||
[np.array([1, 2, 5]), np.array([3, 4])], [np.array([1, 2])],
|
||||
[np.array([5])], [np.array([1, 2, 7]),
|
||||
np.array([4, 8])]
|
||||
]
|
||||
expected_list: List[Tuple[int, int,
|
||||
int, List[int]]] = [(1, 3, 3, [1, 1, 1]),
|
||||
(1, 3, 1, [1, 0, 0]),
|
||||
@@ -649,7 +650,9 @@ class TestAscendScheduler(TestBase):
|
||||
model_runner_output = ModelRunnerOutput(
|
||||
req_ids=req_ids,
|
||||
req_id_to_index=req_to_index,
|
||||
sampled_token_ids=[[0] for _ in range(len(requests))],
|
||||
sampled_token_ids=[
|
||||
np.array([0]) for _ in range(len(requests))
|
||||
],
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[])
|
||||
@@ -1054,16 +1057,15 @@ class TestSchedulerDynamicBatch(TestBase):
|
||||
},
|
||||
num_common_prefix_blocks=0,
|
||||
finished_req_ids=set(),
|
||||
free_encoder_mm_hashes=[],
|
||||
structured_output_request_ids={},
|
||||
grammar_bitmask=None)
|
||||
free_encoder_mm_hashes=[])
|
||||
model_output = ModelRunnerOutput(
|
||||
req_ids=[req.request_id for req in requests],
|
||||
req_id_to_index={
|
||||
req.request_id: i
|
||||
for i, req in enumerate(requests)
|
||||
},
|
||||
sampled_token_ids=[[EOS_TOKEN_ID], [10, 11]
|
||||
sampled_token_ids=[np.array([EOS_TOKEN_ID]),
|
||||
np.array([10, 11])
|
||||
], # First request hits EOS, second continues
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
@@ -1107,17 +1109,16 @@ class TestSchedulerDynamicBatch(TestBase):
|
||||
},
|
||||
num_common_prefix_blocks=0,
|
||||
finished_req_ids=set(),
|
||||
free_encoder_mm_hashes=[],
|
||||
structured_output_request_ids={},
|
||||
grammar_bitmask=None)
|
||||
free_encoder_mm_hashes=[])
|
||||
model_output = ModelRunnerOutput(
|
||||
req_ids=[req.request_id for req in requests],
|
||||
req_id_to_index={
|
||||
req.request_id: i
|
||||
for i, req in enumerate(requests)
|
||||
},
|
||||
sampled_token_ids=[[10, 42, 12],
|
||||
[13, 14]], # First request hits stop token
|
||||
sampled_token_ids=[np.array([10, 42, 12]),
|
||||
np.array([13, 14])
|
||||
], # First request hits stop token
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[])
|
||||
@@ -1159,17 +1160,16 @@ class TestSchedulerDynamicBatch(TestBase):
|
||||
},
|
||||
num_common_prefix_blocks=0,
|
||||
finished_req_ids=set(),
|
||||
free_encoder_mm_hashes=[],
|
||||
structured_output_request_ids={},
|
||||
grammar_bitmask=None)
|
||||
free_encoder_mm_hashes=[])
|
||||
model_output = ModelRunnerOutput(
|
||||
req_ids=[req.request_id for req in requests],
|
||||
req_id_to_index={
|
||||
req.request_id: i
|
||||
for i, req in enumerate(requests)
|
||||
},
|
||||
sampled_token_ids=[[10, 11, 12],
|
||||
[13]], # First request exceeds max_tokens
|
||||
sampled_token_ids=[np.array([10, 11, 12]),
|
||||
np.array([13])
|
||||
], # First request exceeds max_tokens
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[])
|
||||
@@ -1204,13 +1204,11 @@ class TestSchedulerDynamicBatch(TestBase):
|
||||
},
|
||||
num_common_prefix_blocks=0,
|
||||
finished_req_ids=set(),
|
||||
free_encoder_mm_hashes=[],
|
||||
structured_output_request_ids={},
|
||||
grammar_bitmask=None)
|
||||
free_encoder_mm_hashes=[])
|
||||
model_output = ModelRunnerOutput(
|
||||
req_ids=[requests[0].request_id],
|
||||
req_id_to_index={requests[0].request_id: 0},
|
||||
sampled_token_ids=[[EOS_TOKEN_ID, 10, 11]],
|
||||
sampled_token_ids=[np.array([EOS_TOKEN_ID, 10, 11])],
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[])
|
||||
@@ -1267,7 +1265,7 @@ class TestSchedulerDynamicBatch(TestBase):
|
||||
model_runner_output = ModelRunnerOutput(
|
||||
req_ids=[requests[0].request_id],
|
||||
req_id_to_index={requests[0].request_id: 0},
|
||||
sampled_token_ids=[[0]],
|
||||
sampled_token_ids=[np.array([0])],
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[])
|
||||
@@ -1283,7 +1281,7 @@ class TestSchedulerDynamicBatch(TestBase):
|
||||
model_runner_output = ModelRunnerOutput(
|
||||
req_ids=[requests[1].request_id],
|
||||
req_id_to_index={requests[1].request_id: 0},
|
||||
sampled_token_ids=[[0]],
|
||||
sampled_token_ids=[np.array([0])],
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[])
|
||||
@@ -1301,10 +1299,12 @@ class TestSchedulerDynamicBatch(TestBase):
|
||||
spec_tokens_list: List[List[List[int]]] = [[[1, 2, 3]], [[1, 2, 3]],
|
||||
[[1, 2], [3]], [[1]], [[]],
|
||||
[[1, 2, 3], [4, 5, 6]]]
|
||||
output_tokens_list: List[List[List[int]]] = [[[1, 2, 3, 4]], [[1, 5]],
|
||||
[[1, 2, 5], [3, 4]],
|
||||
[[1, 2]], [[5]],
|
||||
[[1, 2, 7], [4, 8]]]
|
||||
output_tokens_list: List[List[List[int]]] = [
|
||||
[np.array([1, 2, 3, 4])], [np.array([1, 5])],
|
||||
[np.array([1, 2, 5]), np.array([3, 4])], [np.array([1, 2])],
|
||||
[np.array([5])], [np.array([1, 2, 7]),
|
||||
np.array([4, 8])]
|
||||
]
|
||||
expected_list: List[Tuple[int, int,
|
||||
int, List[int]]] = [(1, 3, 3, [1, 1, 1]),
|
||||
(1, 3, 1, [1, 0, 0]),
|
||||
@@ -1342,7 +1342,9 @@ class TestSchedulerDynamicBatch(TestBase):
|
||||
model_runner_output = ModelRunnerOutput(
|
||||
req_ids=req_ids,
|
||||
req_id_to_index=req_to_index,
|
||||
sampled_token_ids=[[0] for _ in range(len(requests))],
|
||||
sampled_token_ids=[
|
||||
np.array([0]) for _ in range(len(requests))
|
||||
],
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[])
|
||||
|
||||
@@ -6,6 +6,7 @@
|
||||
import os
|
||||
from typing import Any, Optional
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from vllm import SamplingParams
|
||||
from vllm.config import (CacheConfig, DeviceConfig, KVTransferConfig,
|
||||
@@ -188,7 +189,7 @@ def create_model_runner_output(
|
||||
|
||||
# Make sampled tokens.
|
||||
sampled_token = EOS_TOKEN_ID if use_eos else 0
|
||||
sampled_token_ids = [[sampled_token] for _ in req_ids]
|
||||
sampled_token_ids = [np.array([sampled_token]) for _ in req_ids]
|
||||
|
||||
# Make output data structure.
|
||||
extra_args = {}
|
||||
|
||||
@@ -549,7 +549,6 @@ class TestNPUPlatform(TestBase):
|
||||
dtype="float16",
|
||||
kv_cache_dtype="float16",
|
||||
block_size=64,
|
||||
use_v1=True,
|
||||
#use_sfa=False,
|
||||
use_mla=True,
|
||||
)
|
||||
@@ -570,7 +569,6 @@ class TestNPUPlatform(TestBase):
|
||||
dtype="float16",
|
||||
kv_cache_dtype="float16",
|
||||
block_size=64,
|
||||
use_v1=True,
|
||||
#use_sfa=False,
|
||||
use_mla=True,
|
||||
)
|
||||
@@ -592,7 +590,6 @@ class TestNPUPlatform(TestBase):
|
||||
dtype="float16",
|
||||
kv_cache_dtype="float16",
|
||||
block_size=64,
|
||||
use_v1=True,
|
||||
#use_sfa=False,
|
||||
use_mla=False,
|
||||
)
|
||||
@@ -614,7 +611,6 @@ class TestNPUPlatform(TestBase):
|
||||
dtype="float16",
|
||||
kv_cache_dtype="float16",
|
||||
block_size=64,
|
||||
use_v1=True,
|
||||
#use_sfa=False,
|
||||
use_mla=False,
|
||||
)
|
||||
|
||||
@@ -57,6 +57,8 @@ class TestNPUTorchairWorker(TestBase):
|
||||
worker.model_config = MagicMock()
|
||||
worker.model_config.seed = 42
|
||||
worker.vllm_config = MagicMock()
|
||||
worker.parallel_config = MagicMock()
|
||||
worker.parallel_config.local_world_size = 0
|
||||
|
||||
result = worker._init_device()
|
||||
|
||||
@@ -89,6 +91,8 @@ class TestNPUTorchairWorker(TestBase):
|
||||
worker.model_config = MagicMock()
|
||||
worker.model_config.seed = 42
|
||||
worker.vllm_config = MagicMock()
|
||||
worker.parallel_config = MagicMock()
|
||||
worker.parallel_config.local_world_size = 0
|
||||
|
||||
result = worker._init_device()
|
||||
|
||||
|
||||
@@ -327,6 +327,8 @@ class TestNPUWorker(TestBase):
|
||||
worker = NPUWorker()
|
||||
worker.local_rank = 1
|
||||
worker.model_config = MagicMock()
|
||||
worker.parallel_config = MagicMock()
|
||||
worker.parallel_config.local_world_size = 0
|
||||
worker.model_config.seed = 42
|
||||
|
||||
# Test _init_device
|
||||
|
||||
Reference in New Issue
Block a user