upgrade vLLM to main (#4608)

1. fix https://github.com/vllm-project/vllm/pull/28542
The model structure modifications we involved in are:
     - Qwen2.5-VL(still exist some patch)
     - Qwen2-VL
     - Qwen2
     - DeepSeek series
     - Qwen-moe series
2. fix https://github.com/vllm-project/vllm/pull/29121
   the output token now  type changed from np to `list[list[int]]`

3. fix https://github.com/vllm-project/vllm/pull/29262
    `xformers` backend for multimodal now has been deprecated
4. fix https://github.com/vllm-project/vllm/pull/29342

5. fix https://github.com/vllm-project/vllm/pull/28579
6. fix https://github.com/vllm-project/vllm/pull/28718
7. fix https://github.com/vllm-project/vllm/issues/28665
8. fix https://github.com/vllm-project/vllm/pull/26847
vllm introduced the `optimization-level`, some default config has been
changed, and the param `--enforce-eager` has been deprecated
9. fix http://github.com/vllm-project/vllm/pull/29223 it retuns tuple
for sampler.
10. fix https://github.com/vllm-project/vllm/pull/29471 we'll remove the
related patch to avoid this kind of error.

Co-authored-by: hfadzxy <starmoon_zhang@163.com>
Co-authored-by: wangli <wangli858794774@gmail.com>


- vLLM version: v0.11.2

---------

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Signed-off-by: wangli <wangli858794774@gmail.com>
Signed-off-by: hfadzxy <starmoon_zhang@163.com>
Co-authored-by: wangli <wangli858794774@gmail.com>
Co-authored-by: hfadzxy <starmoon_zhang@163.com>
This commit is contained in:
wangxiyuan
2025-12-02 22:10:52 +08:00
committed by GitHub
parent 4588cdac02
commit 7f2673ea2d
60 changed files with 383 additions and 374 deletions

View File

@@ -176,17 +176,19 @@ class TestAscendSFATorchairMetadata(TestBase):
class TestAscendSFATorchairMetadataBuilder(TestBase):
def test_ascend_sfa_metadata_builder_default(self):
mock_vllm_config = MagicMock()
mock_vllm_config.model_config.max_model_len = 1024
mock_vllm_config.model_config.get_head_size.return_value = 64
mock_vllm_config.model_config.dtype = torch.float16
mock_vllm_config.cache_config.block_size = 16
mock_vllm_config.scheduler_config.max_num_seqs = 4
mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
mock_device = 'cpu'
mock_model_config = MagicMock()
mock_model_config.max_model_len = 1024
mock_model_config.get_head_size.return_value = 64
mock_model_config.dtype = torch.float16
mock_vllm_config = MagicMock()
mock_vllm_config.model_config = mock_model_config
mock_vllm_config.cache_config = MagicMock(block_size=16)
mock_vllm_config.scheduler_config = MagicMock(
max_num_seqs=4, enable_chunked_prefill=False)
mock_vllm_config.speculative_config = None
mock_device = torch.device('cpu')
ascend_config = MagicMock()
ascend_config.torchair_graph_config = MagicMock()
ascend_config.torchair_graph_config.enabled = True
@@ -200,7 +202,7 @@ class TestAscendSFATorchairMetadataBuilder(TestBase):
mock_vllm_config.cache_config.block_size)
self.assertEqual(
builder.chunked_prefill_enabled,
mock_vllm_config.scheduler_config.chunked_prefill_enabled)
mock_vllm_config.scheduler_config.enable_chunked_prefill)
self.assertEqual(builder.torchair_graph_enabled, True)
self.assertEqual(builder.max_blocks, (mock_vllm_config.model_config.max_model_len +
mock_vllm_config.cache_config.block_size - 1) \
@@ -208,17 +210,22 @@ class TestAscendSFATorchairMetadataBuilder(TestBase):
@patch("vllm_ascend.torchair.torchair_sfa.get_ascend_config")
def test_reorder_batch_with_torchair_graph(self, ascend_config):
mock_model_config = MagicMock()
mock_model_config.max_model_len = 1024
mock_model_config.get_head_size.return_value = 64
mock_model_config.dtype = torch.float16
mock_vllm_config = MagicMock()
mock_vllm_config.model_config.max_model_len = 1024
mock_vllm_config.cache_config.block_size = 16
mock_vllm_config.scheduler_config.max_num_seqs = 4
mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
mock_device = 'cpu'
mock_vllm_config.model_config = mock_model_config
mock_vllm_config.cache_config = MagicMock(block_size=16)
mock_vllm_config.scheduler_config = MagicMock(
max_num_seqs=4, enable_chunked_prefill=False)
mock_vllm_config.speculative_config = None
mock_device = torch.device('cpu')
ascend_config.torchair_graph_config = MagicMock()
ascend_config.torchair_graph_config.enabled = True
mock_vllm_config.speculative_config = None
builder = AscendSFATorchairMetadataBuilder(None, None,
mock_vllm_config,
mock_device)
@@ -247,13 +254,18 @@ class TestAscendSFATorchairMetadataBuilder(TestBase):
ascend_config = MagicMock()
mock_ascend_config.return_value = ascend_config
ascend_config.torchair_graph_config.enabled = False
mock_vllm_config = MagicMock()
mock_vllm_config.model_config.max_model_len = 1024
mock_vllm_config.cache_config.block_size = 16
mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
mock_device = 'cpu'
mock_model_config = MagicMock()
mock_model_config.max_model_len = 1024
mock_model_config.get_head_size.return_value = 64
mock_model_config.dtype = torch.float16
mock_vllm_config = MagicMock()
mock_vllm_config.model_config = mock_model_config
mock_vllm_config.cache_config = MagicMock(block_size=16)
mock_vllm_config.scheduler_config = MagicMock(
max_num_seqs=4, enable_chunked_prefill=False)
mock_vllm_config.speculative_config = None
mock_device = torch.device('cpu')
builder = AscendSFATorchairMetadataBuilder(None, None,
mock_vllm_config,
@@ -270,18 +282,25 @@ class TestAscendSFATorchairMetadataBuilder(TestBase):
ascend_config = MagicMock()
mock_ascend_config.return_value = ascend_config
ascend_config.torchair_graph_config.enabled = False
mock_vllm_config = MagicMock()
mock_vllm_config.model_config.max_model_len = 64
mock_vllm_config.cache_config.block_size = 16
mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
mock_device = 'cpu'
mock_model_config = MagicMock()
mock_model_config.max_model_len = 1024
mock_model_config.get_head_size.return_value = 64
mock_model_config.dtype = torch.float16
mock_vllm_config = MagicMock()
mock_vllm_config.model_config = mock_model_config
mock_vllm_config.cache_config = MagicMock(block_size=16)
mock_vllm_config.scheduler_config = MagicMock(
max_num_seqs=4, enable_chunked_prefill=False)
mock_vllm_config.speculative_config = None
mock_device = torch.device('cpu')
builder = AscendSFATorchairMetadataBuilder(None, None,
mock_vllm_config,
mock_device)
builder.max_blocks = 4
block_tables = torch.randint(0, 100, (3, 10), dtype=torch.int32)
result = builder._get_graph_runner_block_tables(3, block_tables)
@@ -295,14 +314,19 @@ class TestAscendSFATorchairMetadataBuilder(TestBase):
ascend_config = MagicMock()
mock_ascend_config.return_value = ascend_config
ascend_config.torchair_graph_config.enabled = False
mock_vllm_config = MagicMock()
mock_vllm_config.model_config.max_model_len = 1024
mock_vllm_config.cache_config.block_size = 16
mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
mock_device = 'cpu'
mock_model_config = MagicMock()
mock_model_config.max_model_len = 1024
mock_model_config.get_head_size.return_value = 64
mock_model_config.dtype = torch.float16
mock_vllm_config = MagicMock()
mock_vllm_config.model_config = mock_model_config
mock_vllm_config.cache_config = MagicMock(block_size=16)
mock_vllm_config.scheduler_config = MagicMock(
max_num_seqs=4, enable_chunked_prefill=False)
mock_vllm_config.speculative_config = None
mock_device = torch.device('cpu')
builder = AscendSFATorchairMetadataBuilder(None, None,
mock_vllm_config,
mock_device)