Upgrade to vLLM 0.11.2 (#4400)

Bump vLLM version to v0.11.2
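
To make the expectation explicit, here is a minimal sketch of the version pin this bump implies; EXPECTED_VLLM_VERSION is an illustrative name, not vllm-ascend's actual check:

import vllm

# Minimal sketch, assuming only that vllm is importable; vllm-ascend's real
# compatibility check may differ.
EXPECTED_VLLM_VERSION = "0.11.2"

if vllm.__version__ != EXPECTED_VLLM_VERSION:
    raise RuntimeError(
        f"vllm-ascend targets vLLM v{EXPECTED_VLLM_VERSION}, "
        f"found v{vllm.__version__}")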

What's broken and changed by vLLM:
1. structured_output is broken by
https://github.com/vllm-project/vllm/pull/26866
2. get_mrope_input_positions is broken by
https://github.com/vllm-project/vllm/pull/28399
3. graph mode is broken by
https://github.com/vllm-project/vllm/pull/25110. We'll upgrade torch to
2.8 later to fix this.
4. embedding is broken by
https://github.com/vllm-project/vllm/pull/27583
5. `get_attn_backend_cls` and the attention backend are broken by
https://github.com/vllm-project/vllm/pull/28534
6. spec decode is broken by
https://github.com/vllm-project/vllm/pull/28771
7. the SP feature is broken by
https://github.com/vllm-project/vllm/pull/27126
8. MTP is broken by https://github.com/vllm-project/vllm/pull/27922
9. LoRA is broken by https://github.com/vllm-project/vllm/pull/21068
10. execute_model is broken by
https://github.com/vllm-project/vllm/pull/26866
11. the `VLLM_DISABLE_SHARED_EXPERTS_STREAM` env var is broken by
https://github.com/vllm-project/vllm/pull/28159
12. KV cache is broken by https://github.com/vllm-project/vllm/pull/27753
13. DP is broken by https://github.com/vllm-project/vllm/pull/25110

What's broken and changed by ourselves:
1. Qwen VL is broken by https://github.com/vllm-project/vllm/pull/28455.
We'll remove the model files in the future to avoid this kind of error.
2. Engine core is broken by
https://github.com/vllm-project/vllm/pull/23691. We'll remove the patch
file in the future.
3. Ascend scheduler is broken by
https://github.com/vllm-project/vllm/pull/28733. We'll remove the Ascend
scheduler later.
4. Qwen3-Next is broken by
https://github.com/vllm-project/vllm/pull/28083. We'll remove the model
files in the future to avoid this kind of error.
5. Qwen VL is also broken by https://github.com/vllm-project/vllm/pull/27764.
We'll remove the model files in the future.

Known issues:
1. Ray doesn't work.
2. Qwen3-Next accuracy is incorrect.
3. Qwen3-VL is broken.
4. Prefix cache + Ascend scheduler + DeepSeek-V2-Lite is broken.

Co-authored-by: MengqingCao <cmq0113@163.com>
Co-authored-by: hfadzxy <starmoon_zhang@163.com>
Co-authored-by: leo-pony <nengjunma@outlook.com>
Co-authored-by: 22dimensions <waitingwind@foxmail.com>
Co-authored-by: shen-shanshan <467638484@qq.com>


- vLLM version: v0.11.2

---------

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Signed-off-by: MengqingCao <cmq0113@163.com>
Signed-off-by: hfadzxy <starmoon_zhang@163.com>
Signed-off-by: leo-pony <nengjunma@outlook.com>
54 changed files with 744 additions and 437 deletions

@@ -14,7 +14,7 @@ from vllm_ascend.attention.utils import AscendCommonAttentionMetadata
 class TestAscendAttentionBackend(TestBase):

     def test_get_name(self):
-        self.assertEqual(AscendAttentionBackend.get_name(), "ASCEND")
+        self.assertEqual(AscendAttentionBackend.get_name(), "CUSTOM")

     def test_get_impl_cls(self):
         self.assertEqual(AscendAttentionBackend.get_impl_cls(),

@@ -107,8 +107,7 @@ class TestACLGraphWrapper(TestBase):
         wrapper = ACLGraphWrapper(runnable=self.mock_runnable,
                                   vllm_config=self.mock_vllm_config,
-                                  runtime_mode=CUDAGraphMode.FULL,
-                                  graph_pool=self.mock_graph_pool)
+                                  runtime_mode=CUDAGraphMode.FULL)
         self.assertEqual(wrapper.runnable, self.mock_runnable)
         self.assertEqual(wrapper.vllm_config, self.mock_vllm_config)
@@ -130,7 +129,6 @@ class TestACLGraphWrapper(TestBase):
             runnable=self.mock_runnable,
             vllm_config=self.mock_vllm_config,
             runtime_mode=CUDAGraphMode.FULL,
-            graph_pool=self.mock_graph_pool,
             cudagraph_options=self.mock_cudagraph_options)
         self.assertEqual(wrapper.runnable, self.mock_runnable)
@@ -152,8 +150,7 @@ class TestACLGraphWrapper(TestBase):
         with self.assertRaises(AssertionError):
             ACLGraphWrapper(runnable=self.mock_runnable,
                             vllm_config=self.mock_vllm_config,
-                            runtime_mode=CUDAGraphMode.NONE,
-                            graph_pool=self.mock_graph_pool)
+                            runtime_mode=CUDAGraphMode.NONE)

     @patch('vllm_ascend.compilation.acl_graph.get_forward_context')
     @patch('vllm_ascend.compilation.acl_graph.current_platform')
@@ -171,7 +168,6 @@ class TestACLGraphWrapper(TestBase):
             runnable=self.mock_runnable,
             vllm_config=self.mock_vllm_config,
             runtime_mode=CUDAGraphMode.FULL,
-            graph_pool=self.mock_graph_pool,
             cudagraph_options=self.mock_cudagraph_options)
         result = wrapper("arg1", "arg2")
@@ -196,7 +192,6 @@ class TestACLGraphWrapper(TestBase):
             runnable=self.mock_runnable,
             vllm_config=self.mock_vllm_config,
             runtime_mode=CUDAGraphMode.FULL,
-            graph_pool=self.mock_graph_pool,
             cudagraph_options=self.mock_cudagraph_options)
         result = wrapper("arg1", "arg2")
@@ -247,7 +242,6 @@ class TestACLGraphWrapper(TestBase):
             runnable=self.mock_runnable,
             vllm_config=self.mock_vllm_config,
             runtime_mode=CUDAGraphMode.FULL,
-            graph_pool=self.mock_graph_pool,
             cudagraph_options=self.mock_cudagraph_options)
         # Create a real torch tensor for the test, not a mock
@@ -319,7 +313,6 @@ class TestACLGraphWrapper(TestBase):
             runnable=self.mock_runnable,
             vllm_config=self.mock_vllm_config,
             runtime_mode=CUDAGraphMode.FULL,
-            graph_pool=self.mock_graph_pool,
             cudagraph_options=self.mock_cudagraph_options)
         # Create a real torch tensor for the test, not a mock
@@ -392,7 +385,6 @@ class TestACLGraphWrapper(TestBase):
             runnable=self.mock_runnable,
             vllm_config=self.mock_vllm_config,
             runtime_mode=CUDAGraphMode.FULL,
-            graph_pool=self.mock_graph_pool,
             cudagraph_options=self.mock_cudagraph_options)
         # First call to capture the graph
@@ -447,7 +439,6 @@ class TestACLGraphWrapper(TestBase):
             runnable=self.mock_runnable,
             vllm_config=self.mock_vllm_config,
             runtime_mode=CUDAGraphMode.FULL,
-            graph_pool=self.mock_graph_pool,
             cudagraph_options=self.mock_cudagraph_options)
         # First call to capture the graph
@@ -518,7 +509,6 @@ class TestACLGraphWrapper(TestBase):
             runnable=self.mock_runnable,
             vllm_config=self.mock_vllm_config,
             runtime_mode=CUDAGraphMode.FULL,
-            graph_pool=self.mock_graph_pool,
             cudagraph_options=self.mock_cudagraph_options)
         # Create a real torch tensor for the test, not a mock
@@ -588,7 +578,6 @@ class TestACLGraphWrapper(TestBase):
             runnable=self.mock_runnable,
             vllm_config=self.mock_vllm_config,
             runtime_mode=CUDAGraphMode.FULL,
-            graph_pool=self.mock_graph_pool,
             cudagraph_options=self.mock_cudagraph_options)
         # Create a real torch tensor for the test, not a mock
@@ -659,7 +648,6 @@ class TestACLGraphWrapper(TestBase):
             runnable=self.mock_runnable,
             vllm_config=self.mock_vllm_config,
             runtime_mode=CUDAGraphMode.FULL,
-            graph_pool=self.mock_graph_pool,
             cudagraph_options=self.mock_cudagraph_options)
         # Create a real torch tensor for the test, not a mock
@@ -680,7 +668,6 @@ class TestACLGraphWrapper(TestBase):
             runnable=mock_runnable,
             vllm_config=self.mock_vllm_config,
             runtime_mode=CUDAGraphMode.FULL,
-            graph_pool=self.mock_graph_pool,
             cudagraph_options=self.mock_cudagraph_options)
         # Should be able to access attributes of the runnable
@@ -699,7 +686,6 @@ class TestACLGraphWrapper(TestBase):
             runnable=mock_runnable,
             vllm_config=self.mock_vllm_config,
             runtime_mode=CUDAGraphMode.FULL,
-            graph_pool=self.mock_graph_pool,
             cudagraph_options=self.mock_cudagraph_options)
         # Should raise AttributeError for non-existent attributes
@@ -715,7 +701,6 @@ class TestACLGraphWrapper(TestBase):
             runnable=self.mock_runnable,
             vllm_config=self.mock_vllm_config,
             runtime_mode=CUDAGraphMode.FULL,
-            graph_pool=self.mock_graph_pool,
             cudagraph_options=self.mock_cudagraph_options)
         unwrapped = wrapper.unwrap()
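
Every hunk in this file makes the same change: vLLM dropped the graph_pool argument from its graph wrapper, so ACLGraphWrapper call sites drop it too. A minimal post-upgrade construction sketch, using mocks exactly as the tests do (the CUDAGraphMode import location is assumed):

from unittest.mock import MagicMock

from vllm.config import CUDAGraphMode  # import location assumed
from vllm_ascend.compilation.acl_graph import ACLGraphWrapper

# graph_pool=... is no longer accepted; the pool is managed inside the wrapper.
wrapper = ACLGraphWrapper(runnable=MagicMock(),
                          vllm_config=MagicMock(),
                          runtime_mode=CUDAGraphMode.FULL,
                          cudagraph_options=MagicMock())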


@@ -48,7 +48,7 @@ class TestAscendSchedulerConfig(TestBase):
             enable_chunked_prefill=False,
             policy="fcfs",
             scheduler_cls="vllm_ascend.core.scheduler.AscendScheduler",
-            max_num_batched_tokens=2048,
+            max_num_batched_tokens=8192,
             max_model_len=2048,
             max_long_partial_prefills=1,
             long_prefill_token_threshold=512,
@@ -58,8 +58,8 @@ class TestAscendSchedulerConfig(TestBase):
         self.assertEqual(ascend_config.policy, "fcfs")
         self.assertEqual(ascend_config.scheduler_cls,
                          "vllm_ascend.core.scheduler.AscendScheduler")
-        self.assertEqual(ascend_config.max_num_batched_tokens, 2048)
-        self.assertEqual(ascend_config.encoder_cache_size, 2048)
+        self.assertEqual(ascend_config.max_num_batched_tokens, 8192)
+        self.assertEqual(ascend_config.encoder_cache_size, 8192)
         self.assertEqual(ascend_config.max_long_partial_prefills, 1)
         self.assertEqual(ascend_config.long_prefill_token_threshold, 512)
@@ -69,7 +69,7 @@ class TestAscendSchedulerConfig(TestBase):
             self.basic_scheduler_config,
             AscendSchedulerConfig(
                 policy="custom_policy",
-                max_num_batched_tokens=2048,
+                max_num_batched_tokens=8192,
                 max_model_len=2048,
             ),
         )
@@ -86,7 +86,8 @@ class TestAscendSchedulerConfig(TestBase):
     def test_valid_config_with_multimodal(self):
         config = AscendSchedulerConfig.initialize_from_config(
-            SchedulerConfig(is_multimodal_model=True), {})
+            SchedulerConfig(is_multimodal_model=True,
+                            max_num_batched_tokens=8192), {})
         self.assertTrue(config.is_multimodal_model)

     def test_valid_config_with_chunked_prefill(self):
@@ -94,12 +95,12 @@ class TestAscendSchedulerConfig(TestBase):
             self.basic_scheduler_config,
             AscendSchedulerConfig(
                 enable_chunked_prefill=True,
-                max_num_batched_tokens=2048,
-                max_model_len=4096,
+                max_num_batched_tokens=8192,
+                max_model_len=8192,
             ),
         )
-        self.assertEqual(ascend_config.max_num_batched_tokens, 2048)
-        self.assertEqual(ascend_config.max_model_len, 4096)
+        self.assertEqual(ascend_config.max_num_batched_tokens, 8192)
+        self.assertEqual(ascend_config.max_model_len, 8192)
         self.assertTrue(ascend_config.enable_chunked_prefill)

     def test_invalid_config_without_chunked_prefill(self):
@@ -109,7 +110,7 @@ class TestAscendSchedulerConfig(TestBase):
             AscendSchedulerConfig(
                 enable_chunked_prefill=False,
                 max_num_batched_tokens=2048,
-                max_model_len=4096,
+                max_model_len=8192,
             ),
         )
         self.assertIn(
@@ -117,7 +118,7 @@ class TestAscendSchedulerConfig(TestBase):
             str(context.exception),
         )
         self.assertIn("max_num_batched_tokens (2048)", str(context.exception))
-        self.assertIn("max_model_len (4096)", str(context.exception))
+        self.assertIn("max_model_len (8192)", str(context.exception))

     def test_initialize_from_config_with_pd_transfer(self):
         ascend_config = AscendSchedulerConfig.initialize_from_config(
@@ -125,7 +126,7 @@ class TestAscendSchedulerConfig(TestBase):
             AscendSchedulerConfig(
                 enable_pd_transfer=True,
                 decode_max_num_seqs=48,
-                max_num_batched_tokens=4096,
+                max_num_batched_tokens=8192,
                 max_model_len=4096,
             ),
         )
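
These hunks track vLLM's new scheduler validation, so the tests now use max_num_batched_tokens=8192. The invariant the failing-path test exercises, as a hedged sketch (import paths and the exception type are assumptions; the 2048-vs-8192 values come from the test):

from vllm.config import SchedulerConfig

from vllm_ascend.core.schedule_config import AscendSchedulerConfig  # path assumed

# With chunked prefill disabled, max_num_batched_tokens must cover max_model_len.
try:
    AscendSchedulerConfig.initialize_from_config(
        SchedulerConfig(),
        AscendSchedulerConfig(
            enable_chunked_prefill=False,
            max_num_batched_tokens=2048,  # smaller than max_model_len
            max_model_len=8192,
        ),
    )
except Exception as exc:  # the test only asserts on the message
    assert "max_num_batched_tokens (2048)" in str(exc)
    assert "max_model_len (8192)" in str(exc)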


@@ -3,6 +3,7 @@
 from typing import Any, Dict, List, Optional, Tuple
 from unittest.mock import MagicMock, patch

+import numpy as np
 import torch
 from vllm.config import (CacheConfig, KVTransferConfig, ModelConfig,
                          SchedulerConfig, SpeculativeConfig, VllmConfig)
@@ -80,7 +81,10 @@ def make_output(scheduler):
         req.request_id: i
         for i, req in enumerate(scheduler.running)
     }
-    sampled_token_ids = [[1000]] * len(scheduler.running)
+    sampled_token_ids = [
+        np.array([1000], dtype=np.int64) for _ in scheduler.running
+    ]
     logprobs = None
     modelrunner_output = ModelRunnerOutput(
@@ -361,16 +365,15 @@ class TestAscendScheduler(TestBase):
             },
             num_common_prefix_blocks=0,
             finished_req_ids=set(),
-            free_encoder_mm_hashes=[],
-            structured_output_request_ids={},
-            grammar_bitmask=None)
+            free_encoder_mm_hashes=[])
         model_output = ModelRunnerOutput(
             req_ids=[req.request_id for req in requests],
             req_id_to_index={
                 req.request_id: i
                 for i, req in enumerate(requests)
             },
-            sampled_token_ids=[[EOS_TOKEN_ID], [10, 11]
+            sampled_token_ids=[np.array([EOS_TOKEN_ID]),
+                               np.array([10, 11])
                                ],  # First request hits EOS, second continues
             logprobs=None,
             prompt_logprobs_dict={},
@@ -414,17 +417,16 @@ class TestAscendScheduler(TestBase):
             },
             num_common_prefix_blocks=0,
             finished_req_ids=set(),
-            free_encoder_mm_hashes=[],
-            structured_output_request_ids={},
-            grammar_bitmask=None)
+            free_encoder_mm_hashes=[])
         model_output = ModelRunnerOutput(
             req_ids=[req.request_id for req in requests],
             req_id_to_index={
                 req.request_id: i
                 for i, req in enumerate(requests)
             },
-            sampled_token_ids=[[10, 42, 12],
-                               [13, 14]],  # First request hits stop token
+            sampled_token_ids=[np.array([10, 42, 12]),
+                               np.array([13, 14])
+                               ],  # First request hits stop token
             logprobs=None,
             prompt_logprobs_dict={},
             pooler_output=[])
@@ -466,17 +468,16 @@ class TestAscendScheduler(TestBase):
             },
             num_common_prefix_blocks=0,
             finished_req_ids=set(),
-            free_encoder_mm_hashes=[],
-            structured_output_request_ids={},
-            grammar_bitmask=None)
+            free_encoder_mm_hashes=[])
         model_output = ModelRunnerOutput(
             req_ids=[req.request_id for req in requests],
             req_id_to_index={
                 req.request_id: i
                 for i, req in enumerate(requests)
             },
-            sampled_token_ids=[[10, 11, 12],
-                               [13]],  # First request exceeds max_tokens
+            sampled_token_ids=[np.array([10, 11, 12]),
+                               np.array([13])
+                               ],  # First request exceeds max_tokens
             logprobs=None,
             prompt_logprobs_dict={},
             pooler_output=[])
@@ -511,13 +512,11 @@ class TestAscendScheduler(TestBase):
             },
             num_common_prefix_blocks=0,
             finished_req_ids=set(),
-            free_encoder_mm_hashes=[],
-            structured_output_request_ids={},
-            grammar_bitmask=None)
+            free_encoder_mm_hashes=[])
         model_output = ModelRunnerOutput(
             req_ids=[requests[0].request_id],
             req_id_to_index={requests[0].request_id: 0},
-            sampled_token_ids=[[EOS_TOKEN_ID, 10, 11]],
+            sampled_token_ids=[np.array([EOS_TOKEN_ID, 10, 11])],
             logprobs=None,
             prompt_logprobs_dict={},
             pooler_output=[])
@@ -574,7 +573,7 @@ class TestAscendScheduler(TestBase):
         model_runner_output = ModelRunnerOutput(
             req_ids=[requests[0].request_id],
             req_id_to_index={requests[0].request_id: 0},
-            sampled_token_ids=[[0]],
+            sampled_token_ids=[np.array([0], dtype=np.int64)],
             logprobs=None,
             prompt_logprobs_dict={},
             pooler_output=[])
@@ -590,7 +589,7 @@ class TestAscendScheduler(TestBase):
         model_runner_output = ModelRunnerOutput(
             req_ids=[requests[1].request_id],
             req_id_to_index={requests[1].request_id: 0},
-            sampled_token_ids=[[0]],
+            sampled_token_ids=[np.array([0], dtype=np.int64)],
             logprobs=None,
             prompt_logprobs_dict={},
             pooler_output=[])
@@ -608,10 +607,12 @@ class TestAscendScheduler(TestBase):
         spec_tokens_list: List[List[List[int]]] = [[[1, 2, 3]], [[1, 2, 3]],
                                                    [[1, 2], [3]], [[1]], [[]],
                                                    [[1, 2, 3], [4, 5, 6]]]
-        output_tokens_list: List[List[List[int]]] = [[[1, 2, 3, 4]], [[1, 5]],
-                                                     [[1, 2, 5], [3, 4]],
-                                                     [[1, 2]], [[5]],
-                                                     [[1, 2, 7], [4, 8]]]
+        output_tokens_list: List[List[List[int]]] = [
+            [np.array([1, 2, 3, 4])], [np.array([1, 5])],
+            [np.array([1, 2, 5]), np.array([3, 4])], [np.array([1, 2])],
+            [np.array([5])], [np.array([1, 2, 7]),
+                              np.array([4, 8])]
+        ]
         expected_list: List[Tuple[int, int,
                                   int, List[int]]] = [(1, 3, 3, [1, 1, 1]),
                                                       (1, 3, 1, [1, 0, 0]),
@@ -649,7 +650,9 @@ class TestAscendScheduler(TestBase):
         model_runner_output = ModelRunnerOutput(
             req_ids=req_ids,
             req_id_to_index=req_to_index,
-            sampled_token_ids=[[0] for _ in range(len(requests))],
+            sampled_token_ids=[
+                np.array([0]) for _ in range(len(requests))
+            ],
             logprobs=None,
             prompt_logprobs_dict={},
             pooler_output=[])
@@ -1054,16 +1057,15 @@ class TestSchedulerDynamicBatch(TestBase):
             },
             num_common_prefix_blocks=0,
             finished_req_ids=set(),
-            free_encoder_mm_hashes=[],
-            structured_output_request_ids={},
-            grammar_bitmask=None)
+            free_encoder_mm_hashes=[])
         model_output = ModelRunnerOutput(
             req_ids=[req.request_id for req in requests],
             req_id_to_index={
                 req.request_id: i
                 for i, req in enumerate(requests)
             },
-            sampled_token_ids=[[EOS_TOKEN_ID], [10, 11]
+            sampled_token_ids=[np.array([EOS_TOKEN_ID]),
+                               np.array([10, 11])
                                ],  # First request hits EOS, second continues
             logprobs=None,
             prompt_logprobs_dict={},
@@ -1107,17 +1109,16 @@ class TestSchedulerDynamicBatch(TestBase):
             },
             num_common_prefix_blocks=0,
             finished_req_ids=set(),
-            free_encoder_mm_hashes=[],
-            structured_output_request_ids={},
-            grammar_bitmask=None)
+            free_encoder_mm_hashes=[])
         model_output = ModelRunnerOutput(
             req_ids=[req.request_id for req in requests],
             req_id_to_index={
                 req.request_id: i
                 for i, req in enumerate(requests)
             },
-            sampled_token_ids=[[10, 42, 12],
-                               [13, 14]],  # First request hits stop token
+            sampled_token_ids=[np.array([10, 42, 12]),
+                               np.array([13, 14])
+                               ],  # First request hits stop token
             logprobs=None,
             prompt_logprobs_dict={},
             pooler_output=[])
@@ -1159,17 +1160,16 @@ class TestSchedulerDynamicBatch(TestBase):
             },
             num_common_prefix_blocks=0,
             finished_req_ids=set(),
-            free_encoder_mm_hashes=[],
-            structured_output_request_ids={},
-            grammar_bitmask=None)
+            free_encoder_mm_hashes=[])
         model_output = ModelRunnerOutput(
             req_ids=[req.request_id for req in requests],
             req_id_to_index={
                 req.request_id: i
                 for i, req in enumerate(requests)
             },
-            sampled_token_ids=[[10, 11, 12],
-                               [13]],  # First request exceeds max_tokens
+            sampled_token_ids=[np.array([10, 11, 12]),
+                               np.array([13])
+                               ],  # First request exceeds max_tokens
             logprobs=None,
             prompt_logprobs_dict={},
             pooler_output=[])
@@ -1204,13 +1204,11 @@ class TestSchedulerDynamicBatch(TestBase):
             },
             num_common_prefix_blocks=0,
             finished_req_ids=set(),
-            free_encoder_mm_hashes=[],
-            structured_output_request_ids={},
-            grammar_bitmask=None)
+            free_encoder_mm_hashes=[])
         model_output = ModelRunnerOutput(
             req_ids=[requests[0].request_id],
             req_id_to_index={requests[0].request_id: 0},
-            sampled_token_ids=[[EOS_TOKEN_ID, 10, 11]],
+            sampled_token_ids=[np.array([EOS_TOKEN_ID, 10, 11])],
             logprobs=None,
             prompt_logprobs_dict={},
             pooler_output=[])
@@ -1267,7 +1265,7 @@ class TestSchedulerDynamicBatch(TestBase):
         model_runner_output = ModelRunnerOutput(
             req_ids=[requests[0].request_id],
             req_id_to_index={requests[0].request_id: 0},
-            sampled_token_ids=[[0]],
+            sampled_token_ids=[np.array([0])],
             logprobs=None,
             prompt_logprobs_dict={},
             pooler_output=[])
@@ -1283,7 +1281,7 @@ class TestSchedulerDynamicBatch(TestBase):
         model_runner_output = ModelRunnerOutput(
             req_ids=[requests[1].request_id],
             req_id_to_index={requests[1].request_id: 0},
-            sampled_token_ids=[[0]],
+            sampled_token_ids=[np.array([0])],
             logprobs=None,
             prompt_logprobs_dict={},
             pooler_output=[])
@@ -1301,10 +1299,12 @@ class TestSchedulerDynamicBatch(TestBase):
         spec_tokens_list: List[List[List[int]]] = [[[1, 2, 3]], [[1, 2, 3]],
                                                    [[1, 2], [3]], [[1]], [[]],
                                                    [[1, 2, 3], [4, 5, 6]]]
-        output_tokens_list: List[List[List[int]]] = [[[1, 2, 3, 4]], [[1, 5]],
-                                                     [[1, 2, 5], [3, 4]],
-                                                     [[1, 2]], [[5]],
-                                                     [[1, 2, 7], [4, 8]]]
+        output_tokens_list: List[List[List[int]]] = [
+            [np.array([1, 2, 3, 4])], [np.array([1, 5])],
+            [np.array([1, 2, 5]), np.array([3, 4])], [np.array([1, 2])],
+            [np.array([5])], [np.array([1, 2, 7]),
+                              np.array([4, 8])]
+        ]
         expected_list: List[Tuple[int, int,
                                   int, List[int]]] = [(1, 3, 3, [1, 1, 1]),
                                                       (1, 3, 1, [1, 0, 0]),
@@ -1342,7 +1342,9 @@ class TestSchedulerDynamicBatch(TestBase):
         model_runner_output = ModelRunnerOutput(
            req_ids=req_ids,
            req_id_to_index=req_to_index,
-            sampled_token_ids=[[0] for _ in range(len(requests))],
+            sampled_token_ids=[
+                np.array([0]) for _ in range(len(requests))
+            ],
             logprobs=None,
             prompt_logprobs_dict={},
             pooler_output=[])
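
All of the sampled_token_ids edits above follow a single API change: ModelRunnerOutput now carries one np.ndarray of token ids per request instead of a List[int]. A minimal construction sketch; the field list mirrors the tests and the import path is assumed from vLLM v1:

import numpy as np

from vllm.v1.outputs import ModelRunnerOutput  # path assumed from vLLM v1

# One np.ndarray per request replaces the old per-request List[int].
output = ModelRunnerOutput(
    req_ids=["req-0", "req-1"],
    req_id_to_index={"req-0": 0, "req-1": 1},
    sampled_token_ids=[np.array([1000], dtype=np.int64),
                       np.array([10, 11], dtype=np.int64)],
    logprobs=None,
    prompt_logprobs_dict={},
    pooler_output=[],
)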


@@ -6,6 +6,7 @@
 import os
 from typing import Any, Optional

+import numpy as np
 import torch
 from vllm import SamplingParams
 from vllm.config import (CacheConfig, DeviceConfig, KVTransferConfig,
@@ -188,7 +189,7 @@ def create_model_runner_output(
     # Make sampled tokens.
     sampled_token = EOS_TOKEN_ID if use_eos else 0
-    sampled_token_ids = [[sampled_token] for _ in req_ids]
+    sampled_token_ids = [np.array([sampled_token]) for _ in req_ids]

     # Make output data structure.
     extra_args = {}


@@ -549,7 +549,6 @@ class TestNPUPlatform(TestBase):
             dtype="float16",
             kv_cache_dtype="float16",
             block_size=64,
-            use_v1=True,
             #use_sfa=False,
             use_mla=True,
         )
@@ -570,7 +569,6 @@ class TestNPUPlatform(TestBase):
             dtype="float16",
             kv_cache_dtype="float16",
             block_size=64,
-            use_v1=True,
             #use_sfa=False,
             use_mla=True,
         )
@@ -592,7 +590,6 @@ class TestNPUPlatform(TestBase):
             dtype="float16",
             kv_cache_dtype="float16",
             block_size=64,
-            use_v1=True,
             #use_sfa=False,
             use_mla=False,
         )
@@ -614,7 +611,6 @@ class TestNPUPlatform(TestBase):
             dtype="float16",
             kv_cache_dtype="float16",
             block_size=64,
-            use_v1=True,
             #use_sfa=False,
             use_mla=False,
         )
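
All four hunks delete the same line: use_v1=True is gone from the get_attn_backend_cls test calls, presumably because vLLM v0.11 only ships the v1 engine (see item 5 above). The kwargs that survive, copied from the tests; anything beyond them is an assumption:

# Kwargs the updated tests still pass to NPUPlatform.get_attn_backend_cls;
# the full signature may carry additional parameters not shown here.
attn_backend_kwargs = dict(
    dtype="float16",
    kv_cache_dtype="float16",
    block_size=64,
    # use_v1=True,  # removed in this upgrade
    use_mla=True,
)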


@@ -57,6 +57,8 @@ class TestNPUTorchairWorker(TestBase):
         worker.model_config = MagicMock()
         worker.model_config.seed = 42
         worker.vllm_config = MagicMock()
+        worker.parallel_config = MagicMock()
+        worker.parallel_config.local_world_size = 0

         result = worker._init_device()
@@ -89,6 +91,8 @@ class TestNPUTorchairWorker(TestBase):
         worker.model_config = MagicMock()
         worker.model_config.seed = 42
         worker.vllm_config = MagicMock()
+        worker.parallel_config = MagicMock()
+        worker.parallel_config.local_world_size = 0

         result = worker._init_device()
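
Both hunks add the same setup: _init_device now reads parallel_config.local_world_size, so any harness that builds a bare worker must stub it first. The arrangement, reduced to its essentials (values copied from the tests):

from unittest.mock import MagicMock

# _init_device consults parallel_config.local_world_size after this upgrade;
# stub it (0 here, as in the tests) before calling into device init.
worker = MagicMock()
worker.model_config.seed = 42
worker.parallel_config.local_world_size = 0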


@@ -327,6 +327,8 @@ class TestNPUWorker(TestBase):
         worker = NPUWorker()
         worker.local_rank = 1
         worker.model_config = MagicMock()
+        worker.parallel_config = MagicMock()
+        worker.parallel_config.local_world_size = 0
         worker.model_config.seed = 42

         # Test _init_device