upgrade to vllm 0.11.2 (#4400)
Bump vLLM version to v0.11.2 What's broken and changed by vLLM: 1. structured_output is broken by https://github.com/vllm-project/vllm/pull/26866 2. get_mrope_input_positions is broken by https://github.com/vllm-project/vllm/pull/28399 3. graph mode is broken by https://github.com/vllm-project/vllm/pull/25110 We'll upgrade torch to 2.8 to fix the problem later 4. embedding is broken by https://github.com/vllm-project/vllm/pull/27583 5. `get_attn_backend_cls` and the attention backend are broken by https://github.com/vllm-project/vllm/pull/28534 6. spec decode is broken by https://github.com/vllm-project/vllm/pull/28771 7. sp feature is broken by https://github.com/vllm-project/vllm/pull/27126 8. mtp is broken by https://github.com/vllm-project/vllm/pull/27922 9. lora is broken by https://github.com/vllm-project/vllm/pull/21068 10. execute_model is broken by https://github.com/vllm-project/vllm/pull/26866 11. `VLLM_DISABLE_SHARED_EXPERTS_STREAM` env is broken by https://github.com/vllm-project/vllm/pull/28159 12. kv cache is broken by https://github.com/vllm-project/vllm/pull/27753 13. dp is broken by https://github.com/vllm-project/vllm/pull/25110 What's broken and changed by ourselves: 1. qwen vl is broken by https://github.com/vllm-project/vllm/pull/28455 We'll remove model files in the future to avoid this kind of error 2. Engine core is broken by https://github.com/vllm-project/vllm/pull/23691 We'll remove the patch file in the future. 3. Ascend scheduler is broken by https://github.com/vllm-project/vllm/pull/28733 We'll remove the Ascend scheduler later. 4. qwen3-next is broken by https://github.com/vllm-project/vllm/pull/28083 We'll remove model files in the future to avoid this kind of error 5. qwen vl is broken by https://github.com/vllm-project/vllm/pull/27764. We'll remove model files in the future Known issues: 1. ray doesn't work 2. the accuracy of qwen3-next is not correct 3. qwen3-vl is broken 4. 
prefix cache + ascend scheduler + deepseek v2 lite is broken. Co-authored-by: MengqingCao <cmq0113@163.com> Co-authored-by: hfadzxy <starmoon_zhang@163.com> Co-authored-by: leo-pony <nengjunma@outlook.com> Co-authored-by: 22dimensions <waitingwind@foxmail.com> Co-authored-by: shen-shanshan <467638484@qq.com> - vLLM version: v0.11.2 --------- Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com> Signed-off-by: MengqingCao <cmq0113@163.com> Signed-off-by: hfadzxy <starmoon_zhang@163.com> Signed-off-by: leo-pony <nengjunma@outlook.com> Co-authored-by: MengqingCao <cmq0113@163.com> Co-authored-by: hfadzxy <starmoon_zhang@163.com> Co-authored-by: leo-pony <nengjunma@outlook.com>
This commit is contained in:
@@ -14,7 +14,7 @@ from vllm_ascend.attention.utils import AscendCommonAttentionMetadata
|
||||
class TestAscendAttentionBackend(TestBase):
|
||||
|
||||
def test_get_name(self):
|
||||
self.assertEqual(AscendAttentionBackend.get_name(), "ASCEND")
|
||||
self.assertEqual(AscendAttentionBackend.get_name(), "CUSTOM")
|
||||
|
||||
def test_get_impl_cls(self):
|
||||
self.assertEqual(AscendAttentionBackend.get_impl_cls(),
|
||||
|
||||
@@ -107,8 +107,7 @@ class TestACLGraphWrapper(TestBase):
|
||||
|
||||
wrapper = ACLGraphWrapper(runnable=self.mock_runnable,
|
||||
vllm_config=self.mock_vllm_config,
|
||||
runtime_mode=CUDAGraphMode.FULL,
|
||||
graph_pool=self.mock_graph_pool)
|
||||
runtime_mode=CUDAGraphMode.FULL)
|
||||
|
||||
self.assertEqual(wrapper.runnable, self.mock_runnable)
|
||||
self.assertEqual(wrapper.vllm_config, self.mock_vllm_config)
|
||||
@@ -130,7 +129,6 @@ class TestACLGraphWrapper(TestBase):
|
||||
runnable=self.mock_runnable,
|
||||
vllm_config=self.mock_vllm_config,
|
||||
runtime_mode=CUDAGraphMode.FULL,
|
||||
graph_pool=self.mock_graph_pool,
|
||||
cudagraph_options=self.mock_cudagraph_options)
|
||||
|
||||
self.assertEqual(wrapper.runnable, self.mock_runnable)
|
||||
@@ -152,8 +150,7 @@ class TestACLGraphWrapper(TestBase):
|
||||
with self.assertRaises(AssertionError):
|
||||
ACLGraphWrapper(runnable=self.mock_runnable,
|
||||
vllm_config=self.mock_vllm_config,
|
||||
runtime_mode=CUDAGraphMode.NONE,
|
||||
graph_pool=self.mock_graph_pool)
|
||||
runtime_mode=CUDAGraphMode.NONE)
|
||||
|
||||
@patch('vllm_ascend.compilation.acl_graph.get_forward_context')
|
||||
@patch('vllm_ascend.compilation.acl_graph.current_platform')
|
||||
@@ -171,7 +168,6 @@ class TestACLGraphWrapper(TestBase):
|
||||
runnable=self.mock_runnable,
|
||||
vllm_config=self.mock_vllm_config,
|
||||
runtime_mode=CUDAGraphMode.FULL,
|
||||
graph_pool=self.mock_graph_pool,
|
||||
cudagraph_options=self.mock_cudagraph_options)
|
||||
|
||||
result = wrapper("arg1", "arg2")
|
||||
@@ -196,7 +192,6 @@ class TestACLGraphWrapper(TestBase):
|
||||
runnable=self.mock_runnable,
|
||||
vllm_config=self.mock_vllm_config,
|
||||
runtime_mode=CUDAGraphMode.FULL,
|
||||
graph_pool=self.mock_graph_pool,
|
||||
cudagraph_options=self.mock_cudagraph_options)
|
||||
|
||||
result = wrapper("arg1", "arg2")
|
||||
@@ -247,7 +242,6 @@ class TestACLGraphWrapper(TestBase):
|
||||
runnable=self.mock_runnable,
|
||||
vllm_config=self.mock_vllm_config,
|
||||
runtime_mode=CUDAGraphMode.FULL,
|
||||
graph_pool=self.mock_graph_pool,
|
||||
cudagraph_options=self.mock_cudagraph_options)
|
||||
|
||||
# Create a real torch tensor for the test, not a mock
|
||||
@@ -319,7 +313,6 @@ class TestACLGraphWrapper(TestBase):
|
||||
runnable=self.mock_runnable,
|
||||
vllm_config=self.mock_vllm_config,
|
||||
runtime_mode=CUDAGraphMode.FULL,
|
||||
graph_pool=self.mock_graph_pool,
|
||||
cudagraph_options=self.mock_cudagraph_options)
|
||||
|
||||
# Create a real torch tensor for the test, not a mock
|
||||
@@ -392,7 +385,6 @@ class TestACLGraphWrapper(TestBase):
|
||||
runnable=self.mock_runnable,
|
||||
vllm_config=self.mock_vllm_config,
|
||||
runtime_mode=CUDAGraphMode.FULL,
|
||||
graph_pool=self.mock_graph_pool,
|
||||
cudagraph_options=self.mock_cudagraph_options)
|
||||
|
||||
# First call to capture the graph
|
||||
@@ -447,7 +439,6 @@ class TestACLGraphWrapper(TestBase):
|
||||
runnable=self.mock_runnable,
|
||||
vllm_config=self.mock_vllm_config,
|
||||
runtime_mode=CUDAGraphMode.FULL,
|
||||
graph_pool=self.mock_graph_pool,
|
||||
cudagraph_options=self.mock_cudagraph_options)
|
||||
|
||||
# First call to capture the graph
|
||||
@@ -518,7 +509,6 @@ class TestACLGraphWrapper(TestBase):
|
||||
runnable=self.mock_runnable,
|
||||
vllm_config=self.mock_vllm_config,
|
||||
runtime_mode=CUDAGraphMode.FULL,
|
||||
graph_pool=self.mock_graph_pool,
|
||||
cudagraph_options=self.mock_cudagraph_options)
|
||||
|
||||
# Create a real torch tensor for the test, not a mock
|
||||
@@ -588,7 +578,6 @@ class TestACLGraphWrapper(TestBase):
|
||||
runnable=self.mock_runnable,
|
||||
vllm_config=self.mock_vllm_config,
|
||||
runtime_mode=CUDAGraphMode.FULL,
|
||||
graph_pool=self.mock_graph_pool,
|
||||
cudagraph_options=self.mock_cudagraph_options)
|
||||
|
||||
# Create a real torch tensor for the test, not a mock
|
||||
@@ -659,7 +648,6 @@ class TestACLGraphWrapper(TestBase):
|
||||
runnable=self.mock_runnable,
|
||||
vllm_config=self.mock_vllm_config,
|
||||
runtime_mode=CUDAGraphMode.FULL,
|
||||
graph_pool=self.mock_graph_pool,
|
||||
cudagraph_options=self.mock_cudagraph_options)
|
||||
|
||||
# Create a real torch tensor for the test, not a mock
|
||||
@@ -680,7 +668,6 @@ class TestACLGraphWrapper(TestBase):
|
||||
runnable=mock_runnable,
|
||||
vllm_config=self.mock_vllm_config,
|
||||
runtime_mode=CUDAGraphMode.FULL,
|
||||
graph_pool=self.mock_graph_pool,
|
||||
cudagraph_options=self.mock_cudagraph_options)
|
||||
|
||||
# Should be able to access attributes of the runnable
|
||||
@@ -699,7 +686,6 @@ class TestACLGraphWrapper(TestBase):
|
||||
runnable=mock_runnable,
|
||||
vllm_config=self.mock_vllm_config,
|
||||
runtime_mode=CUDAGraphMode.FULL,
|
||||
graph_pool=self.mock_graph_pool,
|
||||
cudagraph_options=self.mock_cudagraph_options)
|
||||
|
||||
# Should raise AttributeError for non-existent attributes
|
||||
@@ -715,7 +701,6 @@ class TestACLGraphWrapper(TestBase):
|
||||
runnable=self.mock_runnable,
|
||||
vllm_config=self.mock_vllm_config,
|
||||
runtime_mode=CUDAGraphMode.FULL,
|
||||
graph_pool=self.mock_graph_pool,
|
||||
cudagraph_options=self.mock_cudagraph_options)
|
||||
|
||||
unwrapped = wrapper.unwrap()
|
||||
|
||||
@@ -48,7 +48,7 @@ class TestAscendSchedulerConfig(TestBase):
|
||||
enable_chunked_prefill=False,
|
||||
policy="fcfs",
|
||||
scheduler_cls="vllm_ascend.core.scheduler.AscendScheduler",
|
||||
max_num_batched_tokens=2048,
|
||||
max_num_batched_tokens=8192,
|
||||
max_model_len=2048,
|
||||
max_long_partial_prefills=1,
|
||||
long_prefill_token_threshold=512,
|
||||
@@ -58,8 +58,8 @@ class TestAscendSchedulerConfig(TestBase):
|
||||
self.assertEqual(ascend_config.policy, "fcfs")
|
||||
self.assertEqual(ascend_config.scheduler_cls,
|
||||
"vllm_ascend.core.scheduler.AscendScheduler")
|
||||
self.assertEqual(ascend_config.max_num_batched_tokens, 2048)
|
||||
self.assertEqual(ascend_config.encoder_cache_size, 2048)
|
||||
self.assertEqual(ascend_config.max_num_batched_tokens, 8192)
|
||||
self.assertEqual(ascend_config.encoder_cache_size, 8192)
|
||||
self.assertEqual(ascend_config.max_long_partial_prefills, 1)
|
||||
self.assertEqual(ascend_config.long_prefill_token_threshold, 512)
|
||||
|
||||
@@ -69,7 +69,7 @@ class TestAscendSchedulerConfig(TestBase):
|
||||
self.basic_scheduler_config,
|
||||
AscendSchedulerConfig(
|
||||
policy="custom_policy",
|
||||
max_num_batched_tokens=2048,
|
||||
max_num_batched_tokens=8192,
|
||||
max_model_len=2048,
|
||||
),
|
||||
)
|
||||
@@ -86,7 +86,8 @@ class TestAscendSchedulerConfig(TestBase):
|
||||
|
||||
def test_valid_config_with_multimodal(self):
|
||||
config = AscendSchedulerConfig.initialize_from_config(
|
||||
SchedulerConfig(is_multimodal_model=True), {})
|
||||
SchedulerConfig(is_multimodal_model=True,
|
||||
max_num_batched_tokens=8192), {})
|
||||
self.assertTrue(config.is_multimodal_model)
|
||||
|
||||
def test_valid_config_with_chunked_prefill(self):
|
||||
@@ -94,12 +95,12 @@ class TestAscendSchedulerConfig(TestBase):
|
||||
self.basic_scheduler_config,
|
||||
AscendSchedulerConfig(
|
||||
enable_chunked_prefill=True,
|
||||
max_num_batched_tokens=2048,
|
||||
max_model_len=4096,
|
||||
max_num_batched_tokens=8192,
|
||||
max_model_len=8192,
|
||||
),
|
||||
)
|
||||
self.assertEqual(ascend_config.max_num_batched_tokens, 2048)
|
||||
self.assertEqual(ascend_config.max_model_len, 4096)
|
||||
self.assertEqual(ascend_config.max_num_batched_tokens, 8192)
|
||||
self.assertEqual(ascend_config.max_model_len, 8192)
|
||||
self.assertTrue(ascend_config.enable_chunked_prefill)
|
||||
|
||||
def test_invalid_config_without_chunked_prefill(self):
|
||||
@@ -109,7 +110,7 @@ class TestAscendSchedulerConfig(TestBase):
|
||||
AscendSchedulerConfig(
|
||||
enable_chunked_prefill=False,
|
||||
max_num_batched_tokens=2048,
|
||||
max_model_len=4096,
|
||||
max_model_len=8192,
|
||||
),
|
||||
)
|
||||
self.assertIn(
|
||||
@@ -117,7 +118,7 @@ class TestAscendSchedulerConfig(TestBase):
|
||||
str(context.exception),
|
||||
)
|
||||
self.assertIn("max_num_batched_tokens (2048)", str(context.exception))
|
||||
self.assertIn("max_model_len (4096)", str(context.exception))
|
||||
self.assertIn("max_model_len (8192)", str(context.exception))
|
||||
|
||||
def test_initialize_from_config_with_pd_transfer(self):
|
||||
ascend_config = AscendSchedulerConfig.initialize_from_config(
|
||||
@@ -125,7 +126,7 @@ class TestAscendSchedulerConfig(TestBase):
|
||||
AscendSchedulerConfig(
|
||||
enable_pd_transfer=True,
|
||||
decode_max_num_seqs=48,
|
||||
max_num_batched_tokens=4096,
|
||||
max_num_batched_tokens=8192,
|
||||
max_model_len=4096,
|
||||
),
|
||||
)
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from vllm.config import (CacheConfig, KVTransferConfig, ModelConfig,
|
||||
SchedulerConfig, SpeculativeConfig, VllmConfig)
|
||||
@@ -80,7 +81,10 @@ def make_output(scheduler):
|
||||
req.request_id: i
|
||||
for i, req in enumerate(scheduler.running)
|
||||
}
|
||||
sampled_token_ids = [[1000]] * len(scheduler.running)
|
||||
sampled_token_ids = [
|
||||
np.array([1000], dtype=np.int64) for _ in scheduler.running
|
||||
]
|
||||
|
||||
logprobs = None
|
||||
|
||||
modelrunner_output = ModelRunnerOutput(
|
||||
@@ -361,16 +365,15 @@ class TestAscendScheduler(TestBase):
|
||||
},
|
||||
num_common_prefix_blocks=0,
|
||||
finished_req_ids=set(),
|
||||
free_encoder_mm_hashes=[],
|
||||
structured_output_request_ids={},
|
||||
grammar_bitmask=None)
|
||||
free_encoder_mm_hashes=[])
|
||||
model_output = ModelRunnerOutput(
|
||||
req_ids=[req.request_id for req in requests],
|
||||
req_id_to_index={
|
||||
req.request_id: i
|
||||
for i, req in enumerate(requests)
|
||||
},
|
||||
sampled_token_ids=[[EOS_TOKEN_ID], [10, 11]
|
||||
sampled_token_ids=[np.array([EOS_TOKEN_ID]),
|
||||
np.array([10, 11])
|
||||
], # First request hits EOS, second continues
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
@@ -414,17 +417,16 @@ class TestAscendScheduler(TestBase):
|
||||
},
|
||||
num_common_prefix_blocks=0,
|
||||
finished_req_ids=set(),
|
||||
free_encoder_mm_hashes=[],
|
||||
structured_output_request_ids={},
|
||||
grammar_bitmask=None)
|
||||
free_encoder_mm_hashes=[])
|
||||
model_output = ModelRunnerOutput(
|
||||
req_ids=[req.request_id for req in requests],
|
||||
req_id_to_index={
|
||||
req.request_id: i
|
||||
for i, req in enumerate(requests)
|
||||
},
|
||||
sampled_token_ids=[[10, 42, 12],
|
||||
[13, 14]], # First request hits stop token
|
||||
sampled_token_ids=[np.array([10, 42, 12]),
|
||||
np.array([13, 14])
|
||||
], # First request hits stop token
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[])
|
||||
@@ -466,17 +468,16 @@ class TestAscendScheduler(TestBase):
|
||||
},
|
||||
num_common_prefix_blocks=0,
|
||||
finished_req_ids=set(),
|
||||
free_encoder_mm_hashes=[],
|
||||
structured_output_request_ids={},
|
||||
grammar_bitmask=None)
|
||||
free_encoder_mm_hashes=[])
|
||||
model_output = ModelRunnerOutput(
|
||||
req_ids=[req.request_id for req in requests],
|
||||
req_id_to_index={
|
||||
req.request_id: i
|
||||
for i, req in enumerate(requests)
|
||||
},
|
||||
sampled_token_ids=[[10, 11, 12],
|
||||
[13]], # First request exceeds max_tokens
|
||||
sampled_token_ids=[np.array([10, 11, 12]),
|
||||
np.array([13])
|
||||
], # First request exceeds max_tokens
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[])
|
||||
@@ -511,13 +512,11 @@ class TestAscendScheduler(TestBase):
|
||||
},
|
||||
num_common_prefix_blocks=0,
|
||||
finished_req_ids=set(),
|
||||
free_encoder_mm_hashes=[],
|
||||
structured_output_request_ids={},
|
||||
grammar_bitmask=None)
|
||||
free_encoder_mm_hashes=[])
|
||||
model_output = ModelRunnerOutput(
|
||||
req_ids=[requests[0].request_id],
|
||||
req_id_to_index={requests[0].request_id: 0},
|
||||
sampled_token_ids=[[EOS_TOKEN_ID, 10, 11]],
|
||||
sampled_token_ids=[np.array([EOS_TOKEN_ID, 10, 11])],
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[])
|
||||
@@ -574,7 +573,7 @@ class TestAscendScheduler(TestBase):
|
||||
model_runner_output = ModelRunnerOutput(
|
||||
req_ids=[requests[0].request_id],
|
||||
req_id_to_index={requests[0].request_id: 0},
|
||||
sampled_token_ids=[[0]],
|
||||
sampled_token_ids=[np.array([0], dtype=np.int64)],
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[])
|
||||
@@ -590,7 +589,7 @@ class TestAscendScheduler(TestBase):
|
||||
model_runner_output = ModelRunnerOutput(
|
||||
req_ids=[requests[1].request_id],
|
||||
req_id_to_index={requests[1].request_id: 0},
|
||||
sampled_token_ids=[[0]],
|
||||
sampled_token_ids=[np.array([0], dtype=np.int64)],
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[])
|
||||
@@ -608,10 +607,12 @@ class TestAscendScheduler(TestBase):
|
||||
spec_tokens_list: List[List[List[int]]] = [[[1, 2, 3]], [[1, 2, 3]],
|
||||
[[1, 2], [3]], [[1]], [[]],
|
||||
[[1, 2, 3], [4, 5, 6]]]
|
||||
output_tokens_list: List[List[List[int]]] = [[[1, 2, 3, 4]], [[1, 5]],
|
||||
[[1, 2, 5], [3, 4]],
|
||||
[[1, 2]], [[5]],
|
||||
[[1, 2, 7], [4, 8]]]
|
||||
output_tokens_list: List[List[List[int]]] = [
|
||||
[np.array([1, 2, 3, 4])], [np.array([1, 5])],
|
||||
[np.array([1, 2, 5]), np.array([3, 4])], [np.array([1, 2])],
|
||||
[np.array([5])], [np.array([1, 2, 7]),
|
||||
np.array([4, 8])]
|
||||
]
|
||||
expected_list: List[Tuple[int, int,
|
||||
int, List[int]]] = [(1, 3, 3, [1, 1, 1]),
|
||||
(1, 3, 1, [1, 0, 0]),
|
||||
@@ -649,7 +650,9 @@ class TestAscendScheduler(TestBase):
|
||||
model_runner_output = ModelRunnerOutput(
|
||||
req_ids=req_ids,
|
||||
req_id_to_index=req_to_index,
|
||||
sampled_token_ids=[[0] for _ in range(len(requests))],
|
||||
sampled_token_ids=[
|
||||
np.array([0]) for _ in range(len(requests))
|
||||
],
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[])
|
||||
@@ -1054,16 +1057,15 @@ class TestSchedulerDynamicBatch(TestBase):
|
||||
},
|
||||
num_common_prefix_blocks=0,
|
||||
finished_req_ids=set(),
|
||||
free_encoder_mm_hashes=[],
|
||||
structured_output_request_ids={},
|
||||
grammar_bitmask=None)
|
||||
free_encoder_mm_hashes=[])
|
||||
model_output = ModelRunnerOutput(
|
||||
req_ids=[req.request_id for req in requests],
|
||||
req_id_to_index={
|
||||
req.request_id: i
|
||||
for i, req in enumerate(requests)
|
||||
},
|
||||
sampled_token_ids=[[EOS_TOKEN_ID], [10, 11]
|
||||
sampled_token_ids=[np.array([EOS_TOKEN_ID]),
|
||||
np.array([10, 11])
|
||||
], # First request hits EOS, second continues
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
@@ -1107,17 +1109,16 @@ class TestSchedulerDynamicBatch(TestBase):
|
||||
},
|
||||
num_common_prefix_blocks=0,
|
||||
finished_req_ids=set(),
|
||||
free_encoder_mm_hashes=[],
|
||||
structured_output_request_ids={},
|
||||
grammar_bitmask=None)
|
||||
free_encoder_mm_hashes=[])
|
||||
model_output = ModelRunnerOutput(
|
||||
req_ids=[req.request_id for req in requests],
|
||||
req_id_to_index={
|
||||
req.request_id: i
|
||||
for i, req in enumerate(requests)
|
||||
},
|
||||
sampled_token_ids=[[10, 42, 12],
|
||||
[13, 14]], # First request hits stop token
|
||||
sampled_token_ids=[np.array([10, 42, 12]),
|
||||
np.array([13, 14])
|
||||
], # First request hits stop token
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[])
|
||||
@@ -1159,17 +1160,16 @@ class TestSchedulerDynamicBatch(TestBase):
|
||||
},
|
||||
num_common_prefix_blocks=0,
|
||||
finished_req_ids=set(),
|
||||
free_encoder_mm_hashes=[],
|
||||
structured_output_request_ids={},
|
||||
grammar_bitmask=None)
|
||||
free_encoder_mm_hashes=[])
|
||||
model_output = ModelRunnerOutput(
|
||||
req_ids=[req.request_id for req in requests],
|
||||
req_id_to_index={
|
||||
req.request_id: i
|
||||
for i, req in enumerate(requests)
|
||||
},
|
||||
sampled_token_ids=[[10, 11, 12],
|
||||
[13]], # First request exceeds max_tokens
|
||||
sampled_token_ids=[np.array([10, 11, 12]),
|
||||
np.array([13])
|
||||
], # First request exceeds max_tokens
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[])
|
||||
@@ -1204,13 +1204,11 @@ class TestSchedulerDynamicBatch(TestBase):
|
||||
},
|
||||
num_common_prefix_blocks=0,
|
||||
finished_req_ids=set(),
|
||||
free_encoder_mm_hashes=[],
|
||||
structured_output_request_ids={},
|
||||
grammar_bitmask=None)
|
||||
free_encoder_mm_hashes=[])
|
||||
model_output = ModelRunnerOutput(
|
||||
req_ids=[requests[0].request_id],
|
||||
req_id_to_index={requests[0].request_id: 0},
|
||||
sampled_token_ids=[[EOS_TOKEN_ID, 10, 11]],
|
||||
sampled_token_ids=[np.array([EOS_TOKEN_ID, 10, 11])],
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[])
|
||||
@@ -1267,7 +1265,7 @@ class TestSchedulerDynamicBatch(TestBase):
|
||||
model_runner_output = ModelRunnerOutput(
|
||||
req_ids=[requests[0].request_id],
|
||||
req_id_to_index={requests[0].request_id: 0},
|
||||
sampled_token_ids=[[0]],
|
||||
sampled_token_ids=[np.array([0])],
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[])
|
||||
@@ -1283,7 +1281,7 @@ class TestSchedulerDynamicBatch(TestBase):
|
||||
model_runner_output = ModelRunnerOutput(
|
||||
req_ids=[requests[1].request_id],
|
||||
req_id_to_index={requests[1].request_id: 0},
|
||||
sampled_token_ids=[[0]],
|
||||
sampled_token_ids=[np.array([0])],
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[])
|
||||
@@ -1301,10 +1299,12 @@ class TestSchedulerDynamicBatch(TestBase):
|
||||
spec_tokens_list: List[List[List[int]]] = [[[1, 2, 3]], [[1, 2, 3]],
|
||||
[[1, 2], [3]], [[1]], [[]],
|
||||
[[1, 2, 3], [4, 5, 6]]]
|
||||
output_tokens_list: List[List[List[int]]] = [[[1, 2, 3, 4]], [[1, 5]],
|
||||
[[1, 2, 5], [3, 4]],
|
||||
[[1, 2]], [[5]],
|
||||
[[1, 2, 7], [4, 8]]]
|
||||
output_tokens_list: List[List[List[int]]] = [
|
||||
[np.array([1, 2, 3, 4])], [np.array([1, 5])],
|
||||
[np.array([1, 2, 5]), np.array([3, 4])], [np.array([1, 2])],
|
||||
[np.array([5])], [np.array([1, 2, 7]),
|
||||
np.array([4, 8])]
|
||||
]
|
||||
expected_list: List[Tuple[int, int,
|
||||
int, List[int]]] = [(1, 3, 3, [1, 1, 1]),
|
||||
(1, 3, 1, [1, 0, 0]),
|
||||
@@ -1342,7 +1342,9 @@ class TestSchedulerDynamicBatch(TestBase):
|
||||
model_runner_output = ModelRunnerOutput(
|
||||
req_ids=req_ids,
|
||||
req_id_to_index=req_to_index,
|
||||
sampled_token_ids=[[0] for _ in range(len(requests))],
|
||||
sampled_token_ids=[
|
||||
np.array([0]) for _ in range(len(requests))
|
||||
],
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[])
|
||||
|
||||
@@ -6,6 +6,7 @@
|
||||
import os
|
||||
from typing import Any, Optional
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from vllm import SamplingParams
|
||||
from vllm.config import (CacheConfig, DeviceConfig, KVTransferConfig,
|
||||
@@ -188,7 +189,7 @@ def create_model_runner_output(
|
||||
|
||||
# Make sampled tokens.
|
||||
sampled_token = EOS_TOKEN_ID if use_eos else 0
|
||||
sampled_token_ids = [[sampled_token] for _ in req_ids]
|
||||
sampled_token_ids = [np.array([sampled_token]) for _ in req_ids]
|
||||
|
||||
# Make output data structure.
|
||||
extra_args = {}
|
||||
|
||||
@@ -549,7 +549,6 @@ class TestNPUPlatform(TestBase):
|
||||
dtype="float16",
|
||||
kv_cache_dtype="float16",
|
||||
block_size=64,
|
||||
use_v1=True,
|
||||
#use_sfa=False,
|
||||
use_mla=True,
|
||||
)
|
||||
@@ -570,7 +569,6 @@ class TestNPUPlatform(TestBase):
|
||||
dtype="float16",
|
||||
kv_cache_dtype="float16",
|
||||
block_size=64,
|
||||
use_v1=True,
|
||||
#use_sfa=False,
|
||||
use_mla=True,
|
||||
)
|
||||
@@ -592,7 +590,6 @@ class TestNPUPlatform(TestBase):
|
||||
dtype="float16",
|
||||
kv_cache_dtype="float16",
|
||||
block_size=64,
|
||||
use_v1=True,
|
||||
#use_sfa=False,
|
||||
use_mla=False,
|
||||
)
|
||||
@@ -614,7 +611,6 @@ class TestNPUPlatform(TestBase):
|
||||
dtype="float16",
|
||||
kv_cache_dtype="float16",
|
||||
block_size=64,
|
||||
use_v1=True,
|
||||
#use_sfa=False,
|
||||
use_mla=False,
|
||||
)
|
||||
|
||||
@@ -57,6 +57,8 @@ class TestNPUTorchairWorker(TestBase):
|
||||
worker.model_config = MagicMock()
|
||||
worker.model_config.seed = 42
|
||||
worker.vllm_config = MagicMock()
|
||||
worker.parallel_config = MagicMock()
|
||||
worker.parallel_config.local_world_size = 0
|
||||
|
||||
result = worker._init_device()
|
||||
|
||||
@@ -89,6 +91,8 @@ class TestNPUTorchairWorker(TestBase):
|
||||
worker.model_config = MagicMock()
|
||||
worker.model_config.seed = 42
|
||||
worker.vllm_config = MagicMock()
|
||||
worker.parallel_config = MagicMock()
|
||||
worker.parallel_config.local_world_size = 0
|
||||
|
||||
result = worker._init_device()
|
||||
|
||||
|
||||
@@ -327,6 +327,8 @@ class TestNPUWorker(TestBase):
|
||||
worker = NPUWorker()
|
||||
worker.local_rank = 1
|
||||
worker.model_config = MagicMock()
|
||||
worker.parallel_config = MagicMock()
|
||||
worker.parallel_config.local_world_size = 0
|
||||
worker.model_config.seed = 42
|
||||
|
||||
# Test _init_device
|
||||
|
||||
Reference in New Issue
Block a user