upgrade vLLM to main (#4608)

1. fix https://github.com/vllm-project/vllm/pull/28542 The model structures affected by this change are: - Qwen2.5-VL (some patches still remain) - Qwen2-VL - Qwen2 - DeepSeek series - Qwen-moe series 2. fix https://github.com/vllm-project/vllm/pull/29121 the output token type has changed from np to `list[list[int]]` 3. fix https://github.com/vllm-project/vllm/pull/29262 the `xformers` backend for multimodal has now been deprecated 4. fix https://github.com/vllm-project/vllm/pull/29342 5. fix https://github.com/vllm-project/vllm/pull/28579 6. fix https://github.com/vllm-project/vllm/pull/28718 7. fix https://github.com/vllm-project/vllm/issues/28665 8. fix https://github.com/vllm-project/vllm/pull/26847 vLLM introduced the `optimization-level`, some default configs have changed, and the param `--enforce-eager` has been deprecated 9. fix https://github.com/vllm-project/vllm/pull/29223 the sampler now returns a tuple. 10. fix https://github.com/vllm-project/vllm/pull/29471 we'll remove the related patch to avoid this kind of error. Co-authored-by: hfadzxy <starmoon_zhang@163.com> Co-authored-by: wangli <wangli858794774@gmail.com> - vLLM version: v0.11.2 --------- Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com> Signed-off-by: wangli <wangli858794774@gmail.com> Signed-off-by: hfadzxy <starmoon_zhang@163.com> Co-authored-by: wangli <wangli858794774@gmail.com> Co-authored-by: hfadzxy <starmoon_zhang@163.com>
2025-12-02 22:10:52 +08:00
parent 4588cdac02
commit 7f2673ea2d
60 changed files with 383 additions and 374 deletions
--- a/tests/ut/attention/test_mla_v1.py
+++ b/tests/ut/attention/test_mla_v1.py
@@ -191,7 +191,7 @@ class TestAscendMLAMetadataBuilder(TestBase):
        mock_vllm_config.cache_config.block_size = 16
        mock_vllm_config.scheduler_config.max_num_seqs = 4
        mock_vllm_config.scheduler_config.decode_max_num_seqs = 4
-        mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
+        mock_vllm_config.scheduler_config.enable_chunked_prefill = False
        mock_device = 'cpu'

        mock_dcp.world_size = 1
@@ -213,7 +213,7 @@ class TestAscendMLAMetadataBuilder(TestBase):
                             mock_vllm_config.cache_config.block_size)
            self.assertEqual(
                builder.chunked_prefill_enabled,
-                mock_vllm_config.scheduler_config.chunked_prefill_enabled)
+                mock_vllm_config.scheduler_config.enable_chunked_prefill)

    @patch('vllm.distributed.parallel_state.get_dcp_group')
    @patch('vllm.distributed.parallel_state._DCP',
@@ -230,7 +230,7 @@ class TestAscendMLAMetadataBuilder(TestBase):
        mock_vllm_config.cache_config.block_size = 16
        mock_vllm_config.scheduler_config.max_num_seqs = 4
        mock_vllm_config.scheduler_config.decode_max_num_seqs = 4
-        mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
+        mock_vllm_config.scheduler_config.enable_chunked_prefill = False
        mock_device = 'cpu'

        mock_dcp.world_size = 1
@@ -254,7 +254,7 @@ class TestAscendMLAMetadataBuilder(TestBase):
                             mock_vllm_config.cache_config.block_size)
            self.assertEqual(
                builder.chunked_prefill_enabled,
-                mock_vllm_config.scheduler_config.chunked_prefill_enabled)
+                mock_vllm_config.scheduler_config.enable_chunked_prefill)

    @patch('vllm.distributed.parallel_state.get_dcp_group')
    @patch('vllm.distributed.parallel_state._DCP',
@@ -321,7 +321,7 @@ class TestAscendMLAMetadataBuilder(TestBase):
        mock_vllm_config.cache_config.block_size = 16
        mock_vllm_config.scheduler_config.max_num_seqs = 4
        mock_vllm_config.scheduler_config.decode_max_num_seqs = 4
-        mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
+        mock_vllm_config.scheduler_config.enable_chunked_prefill = False
        mock_device = 'cpu'

        mock_dcp.world_size = 1
@@ -440,8 +440,10 @@ class TestAscendMLAMetadataBuilderBuild(TestBase):
        self.mock_vllm_config.model_config = ModelConfig(max_model_len=2048)
        self.mock_vllm_config.model_config.hf_text_config.qk_rope_head_dim = 32
        self.mock_vllm_config.cache_config = CacheConfig(block_size=32)
-        self.mock_vllm_config.scheduler_config = SchedulerConfig(
-            max_num_seqs=8, chunked_prefill_enabled=True)
+        mock_scheduler_config = MagicMock(spec=SchedulerConfig)
+        mock_scheduler_config.max_num_seqs = 8
+        mock_scheduler_config.chunked_prefill_enabled = True
+        self.mock_vllm_config.scheduler_config = mock_scheduler_config
        self.mock_vllm_config.speculative_config = None
        self.mock_device = torch.device("cpu")

@@ -454,12 +456,20 @@ class TestAscendMLAMetadataBuilderBuild(TestBase):
        "vllm_ascend.attention.mla_v1.get_decode_context_model_parallel_world_size"
    )
    @patch("vllm_ascend.attention.mla_v1.get_ascend_config")
-    def test_build_prefix_no_cache_metadata(self, mock_get_ascend_config,
+    @patch("vllm_ascend.attention.mla_v1.torch.zeros", wraps=torch.zeros)
+    @patch("torch.Tensor.npu", new=lambda self: self)
+    @patch("torch.npu.is_available")
+    def test_build_prefix_no_cache_metadata(self, mock_npu_available,
+                                            mock_zeros, mock_get_ascend_config,
                                            mock_dcp_world_size):
-        if not torch.npu.is_available():
-            self.skipTest("NPU not available, skipping NPU-dependent tests")
+        mock_npu_available.return_value = False
        mock_dcp_world_size.return_value = 1

+        def zeros_override(*args, **kwargs):
+            kwargs.pop('pin_memory', None)
+            return mock_zeros._mock_wraps(*args, **kwargs)
+
+        mock_zeros.side_effect = zeros_override
        common_attn_metadata = AscendCommonAttentionMetadata(
            query_start_loc=torch.tensor([0, 3, 7]),
            query_start_loc_cpu=torch.tensor([0, 3, 7]),
@@ -506,12 +516,21 @@ class TestAscendMLAMetadataBuilderBuild(TestBase):
        "vllm_ascend.attention.mla_v1.get_decode_context_model_parallel_world_size"
    )
    @patch("vllm_ascend.attention.mla_v1.get_ascend_config")
-    def test_build_chunked_prefix_metadata(self, mock_get_ascend_config,
+    @patch("vllm_ascend.attention.mla_v1.torch.zeros", wraps=torch.zeros)
+    @patch("torch.Tensor.npu", new=lambda self: self)
+    @patch("torch.npu.is_available")
+    def test_build_chunked_prefix_metadata(self, mock_npu_available,
+                                           mock_zeros, mock_get_ascend_config,
                                           mock_dcp_world_size):
-        if not torch.npu.is_available():
-            self.skipTest("NPU not available, skipping NPU-dependent tests")
+        mock_npu_available.return_value = False
        mock_dcp_world_size.return_value = 1

+        def zeros_override(*args, **kwargs):
+            kwargs.pop('pin_memory', None)
+            return mock_zeros._mock_wraps(*args, **kwargs)
+
+        mock_zeros.side_effect = zeros_override
+
        common_attn_metadata = AscendCommonAttentionMetadata(
            query_start_loc=torch.tensor([0, 2, 5, 9]),
            query_start_loc_cpu=torch.tensor([0, 2, 5, 9]),