[CI] Drop ascend scheduler from test (#4613)
Drop ascend scheduler from test - vLLM version: v0.11.2 Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
1
.github/workflows/_e2e_test.yaml
vendored
1
.github/workflows/_e2e_test.yaml
vendored
@@ -91,7 +91,6 @@ jobs:
|
|||||||
pytest -sv tests/e2e/singlecard/test_completion_with_prompt_embeds.py
|
pytest -sv tests/e2e/singlecard/test_completion_with_prompt_embeds.py
|
||||||
pytest -sv tests/e2e/singlecard/test_aclgraph.py
|
pytest -sv tests/e2e/singlecard/test_aclgraph.py
|
||||||
pytest -sv tests/e2e/singlecard/test_aclgraph_mem.py
|
pytest -sv tests/e2e/singlecard/test_aclgraph_mem.py
|
||||||
pytest -sv tests/e2e/singlecard/test_ascend_scheduler.py
|
|
||||||
pytest -sv tests/e2e/singlecard/test_bge_model.py
|
pytest -sv tests/e2e/singlecard/test_bge_model.py
|
||||||
pytest -sv tests/e2e/singlecard/test_camem.py
|
pytest -sv tests/e2e/singlecard/test_camem.py
|
||||||
pytest -sv tests/e2e/singlecard/test_embedding.py
|
pytest -sv tests/e2e/singlecard/test_embedding.py
|
||||||
|
|||||||
@@ -1,118 +0,0 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
||||||
import pytest
|
|
||||||
from vllm import SamplingParams
|
|
||||||
|
|
||||||
from tests.e2e.conftest import VllmRunner
|
|
||||||
from tests.e2e.model_utils import check_outputs_equal
|
|
||||||
|
|
||||||
MODEL = "Qwen/Qwen3-0.6B"
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("enforce_eager", [True, False])
|
|
||||||
def test_concurrent_partial_prefill(enforce_eager):
|
|
||||||
with VllmRunner(MODEL,
|
|
||||||
max_num_seqs=3,
|
|
||||||
max_num_batched_tokens=8192,
|
|
||||||
enforce_eager=enforce_eager,
|
|
||||||
gpu_memory_utilization=0.7) as vllm_model:
|
|
||||||
outputs = vllm_model.model.generate(["Hello my name is Robert and I"] *
|
|
||||||
3)
|
|
||||||
assert len(outputs) == 3
|
|
||||||
for output in outputs:
|
|
||||||
assert len(output.outputs) == 1
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("enforce_eager", [True, False])
|
|
||||||
def test_prefix_cache_stats_is_recorded(enforce_eager):
|
|
||||||
with VllmRunner(MODEL,
|
|
||||||
max_num_seqs=3,
|
|
||||||
max_num_batched_tokens=8192,
|
|
||||||
enforce_eager=enforce_eager,
|
|
||||||
gpu_memory_utilization=0.7) as vllm_model:
|
|
||||||
# 129 tokens make sure the first 128 tokens are cached (see the assertion below)
|
|
||||||
input_tokens = {"prompt_token_ids": [101] * 129}
|
|
||||||
_ = vllm_model.model.generate([input_tokens])
|
|
||||||
outputs = vllm_model.model.generate([input_tokens])
|
|
||||||
assert outputs[0].num_cached_tokens == 128
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("max_tokens",
|
|
||||||
[4]) # cannot align results when max_tokens > 4
|
|
||||||
@pytest.mark.parametrize("chunked_prefill_token_size", [2048])
|
|
||||||
def test_chunked_prefill_with_scheduler_dynamic_batch(
|
|
||||||
max_tokens: int, chunked_prefill_token_size: int) -> None:
|
|
||||||
example_prompts = [
|
|
||||||
"vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs."
|
|
||||||
]
|
|
||||||
max_num_seqs = chunked_prefill_token_size
|
|
||||||
max_num_batched_tokens = chunked_prefill_token_size
|
|
||||||
with VllmRunner(MODEL,
|
|
||||||
additional_config={
|
|
||||||
'SLO_limits_for_dynamic_batch': 0,
|
|
||||||
},
|
|
||||||
max_num_seqs=max_num_seqs,
|
|
||||||
max_num_batched_tokens=max_num_batched_tokens,
|
|
||||||
max_model_len=2048,
|
|
||||||
gpu_memory_utilization=0.7) as vllm_model:
|
|
||||||
dynamic_batch_output = vllm_model.generate_greedy(
|
|
||||||
example_prompts, max_tokens)
|
|
||||||
|
|
||||||
with VllmRunner(MODEL,
|
|
||||||
additional_config={
|
|
||||||
'SLO_limits_for_dynamic_batch': -1,
|
|
||||||
},
|
|
||||||
max_model_len=2048,
|
|
||||||
gpu_memory_utilization=0.7) as vllm_model:
|
|
||||||
vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)
|
|
||||||
|
|
||||||
check_outputs_equal(
|
|
||||||
outputs_0_lst=vllm_output,
|
|
||||||
outputs_1_lst=dynamic_batch_output,
|
|
||||||
name_0="vllm_output",
|
|
||||||
name_1="chunked_prefill_output",
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def test_async_scheduling_eager() -> None:
|
|
||||||
prompts = [
|
|
||||||
"Hello, my name is",
|
|
||||||
"The president of the United States is",
|
|
||||||
"The capital of France is",
|
|
||||||
"The future of AI is",
|
|
||||||
] * 10
|
|
||||||
sampling_params = SamplingParams(temperature=0.2,
|
|
||||||
max_tokens=10,
|
|
||||||
stop_token_ids=None)
|
|
||||||
|
|
||||||
with VllmRunner(
|
|
||||||
"Qwen/Qwen2.5-0.5B-Instruct",
|
|
||||||
max_model_len=4096,
|
|
||||||
max_num_seqs=50,
|
|
||||||
dtype="bfloat16",
|
|
||||||
gpu_memory_utilization=0.9,
|
|
||||||
async_scheduling=True,
|
|
||||||
) as vllm_model:
|
|
||||||
vllm_model.generate(prompts, sampling_params=sampling_params)
|
|
||||||
|
|
||||||
|
|
||||||
def test_async_scheduling_with_full_graph() -> None:
|
|
||||||
prompts = [
|
|
||||||
"Hello, my name is",
|
|
||||||
"The president of the United States is",
|
|
||||||
"The capital of France is",
|
|
||||||
"The future of AI is",
|
|
||||||
] * 10
|
|
||||||
sampling_params = SamplingParams(temperature=0.2,
|
|
||||||
max_tokens=10,
|
|
||||||
stop_token_ids=None)
|
|
||||||
|
|
||||||
with VllmRunner("Qwen/Qwen3-8B",
|
|
||||||
max_model_len=4096,
|
|
||||||
max_num_seqs=50,
|
|
||||||
dtype="bfloat16",
|
|
||||||
gpu_memory_utilization=0.9,
|
|
||||||
async_scheduling=True,
|
|
||||||
compilation_config={"cudagraph_mode":
|
|
||||||
"FULL"}) as vllm_model:
|
|
||||||
vllm_model.generate(prompts, sampling_params=sampling_params)
|
|
||||||
@@ -33,13 +33,6 @@ class TestAscendW8A8FusedMoEMethod(TestBase):
|
|||||||
mock_get_ep_group.return_value = mock_ep_group
|
mock_get_ep_group.return_value = mock_ep_group
|
||||||
mock_ascend_config = Mock()
|
mock_ascend_config = Mock()
|
||||||
|
|
||||||
# Create a Mock object with concrete attributes to represent ascend_scheduler_config
|
|
||||||
mock_ascend_scheduler_config = Mock()
|
|
||||||
mock_ascend_scheduler_config.enabled = False
|
|
||||||
mock_ascend_scheduler_config.max_num_batched_tokens = 1024
|
|
||||||
mock_ascend_scheduler_config.max_model_len = 2048
|
|
||||||
mock_ascend_config.ascend_scheduler_config = mock_ascend_scheduler_config
|
|
||||||
|
|
||||||
mock_ascend_config.torchair_graph_config = Mock(enabled=False)
|
mock_ascend_config.torchair_graph_config = Mock(enabled=False)
|
||||||
mock_ascend_config.enable_chunked_prefill = False
|
mock_ascend_config.enable_chunked_prefill = False
|
||||||
mock_get_ascend_config.return_value = mock_ascend_config
|
mock_get_ascend_config.return_value = mock_ascend_config
|
||||||
|
|||||||
@@ -56,9 +56,6 @@ class TestAscendConfig(TestBase):
|
|||||||
self.assertTrue(torchair_graph_config.enable_frozen_parameter)
|
self.assertTrue(torchair_graph_config.enable_frozen_parameter)
|
||||||
self.assertFalse(torchair_graph_config.enable_kv_nz)
|
self.assertFalse(torchair_graph_config.enable_kv_nz)
|
||||||
|
|
||||||
ascend_scheduler_config = ascend_config.ascend_scheduler_config
|
|
||||||
self.assertFalse(ascend_scheduler_config.enabled)
|
|
||||||
|
|
||||||
@_clean_up_ascend_config
|
@_clean_up_ascend_config
|
||||||
def test_init_ascend_config_with_additional_config(self):
|
def test_init_ascend_config_with_additional_config(self):
|
||||||
test_vllm_config = VllmConfig()
|
test_vllm_config = VllmConfig()
|
||||||
@@ -74,9 +71,6 @@ class TestAscendConfig(TestBase):
|
|||||||
"enable_kv_nz": True
|
"enable_kv_nz": True
|
||||||
},
|
},
|
||||||
"multistream_overlap_shared_expert": True,
|
"multistream_overlap_shared_expert": True,
|
||||||
"ascend_scheduler_config": {
|
|
||||||
"enabled": True
|
|
||||||
},
|
|
||||||
"expert_map_path": "test_expert_map_path",
|
"expert_map_path": "test_expert_map_path",
|
||||||
"refresh": True,
|
"refresh": True,
|
||||||
}
|
}
|
||||||
@@ -94,9 +88,6 @@ class TestAscendConfig(TestBase):
|
|||||||
self.assertTrue(torchair_graph_config.enable_frozen_parameter)
|
self.assertTrue(torchair_graph_config.enable_frozen_parameter)
|
||||||
self.assertTrue(torchair_graph_config.enable_kv_nz)
|
self.assertTrue(torchair_graph_config.enable_kv_nz)
|
||||||
|
|
||||||
ascend_scheduler_config = ascend_config.ascend_scheduler_config
|
|
||||||
self.assertTrue(ascend_scheduler_config.enabled)
|
|
||||||
|
|
||||||
@_clean_up_ascend_config
|
@_clean_up_ascend_config
|
||||||
def test_init_ascend_config_with_refresh(self):
|
def test_init_ascend_config_with_refresh(self):
|
||||||
test_vllm_config = VllmConfig()
|
test_vllm_config = VllmConfig()
|
||||||
|
|||||||
@@ -522,31 +522,6 @@ class TestNPUPlatform(TestBase):
|
|||||||
self.platform.check_and_update_config(vllm_config)
|
self.platform.check_and_update_config(vllm_config)
|
||||||
self.assertEqual(vllm_config.compilation_config.custom_ops, [])
|
self.assertEqual(vllm_config.compilation_config.custom_ops, [])
|
||||||
|
|
||||||
@patch('vllm_ascend.utils.get_ascend_device_type',
|
|
||||||
return_value=AscendDeviceType._910_93)
|
|
||||||
@patch("vllm_ascend.ascend_config.check_ascend_config")
|
|
||||||
@patch("vllm_ascend.ascend_config.init_ascend_config")
|
|
||||||
@patch(
|
|
||||||
"vllm_ascend.core.recompute_schedule_config.RecomputeSchedulerConfig.initialize_from_config"
|
|
||||||
)
|
|
||||||
def test_check_and_update_config_ascend_scheduler_config(
|
|
||||||
self, mock_init_recompute, mock_init_ascend, mock_check_ascend,
|
|
||||||
mock_soc_version):
|
|
||||||
mock_ascend_config = TestNPUPlatform.mock_vllm_ascend_config()
|
|
||||||
mock_ascend_config.ascend_scheduler_config.enabled = True
|
|
||||||
mock_init_ascend.return_value = mock_ascend_config
|
|
||||||
vllm_config = TestNPUPlatform.mock_vllm_config()
|
|
||||||
vllm_config.parallel_config.tensor_parallel_size = 1
|
|
||||||
mock_init_recompute.return_value = MagicMock()
|
|
||||||
|
|
||||||
with patch("vllm_ascend.core.schedule_config.AscendSchedulerConfig"
|
|
||||||
) as mock_scheduler:
|
|
||||||
from vllm_ascend import platform
|
|
||||||
|
|
||||||
importlib.reload(platform)
|
|
||||||
self.platform.check_and_update_config(vllm_config)
|
|
||||||
mock_scheduler.initialize_from_config.assert_called_once()
|
|
||||||
|
|
||||||
@patch('vllm_ascend.platform.get_ascend_config')
|
@patch('vllm_ascend.platform.get_ascend_config')
|
||||||
def test_get_attn_backend_cls_use_v1_and_mla(self, mock_get_ascend_config):
|
def test_get_attn_backend_cls_use_v1_and_mla(self, mock_get_ascend_config):
|
||||||
mock_config = MagicMock()
|
mock_config = MagicMock()
|
||||||
|
|||||||
@@ -253,12 +253,10 @@ class TestUtils(TestBase):
|
|||||||
model_path = os.path.join(os.path.dirname(__file__), "fake_weight")
|
model_path = os.path.join(os.path.dirname(__file__), "fake_weight")
|
||||||
test_model_config = ModelConfig(model=model_path, enforce_eager=True)
|
test_model_config = ModelConfig(model=model_path, enforce_eager=True)
|
||||||
test_parallel_config = ParallelConfig()
|
test_parallel_config = ParallelConfig()
|
||||||
ascend_config = {"ascend_scheduler_config": {"enabled": False}}
|
|
||||||
test_vllm_config = VllmConfig(
|
test_vllm_config = VllmConfig(
|
||||||
model_config=test_model_config,
|
model_config=test_model_config,
|
||||||
compilation_config=test_compilation_config,
|
compilation_config=test_compilation_config,
|
||||||
parallel_config=test_parallel_config,
|
parallel_config=test_parallel_config)
|
||||||
additional_config=ascend_config)
|
|
||||||
utils.update_aclgraph_sizes(test_vllm_config)
|
utils.update_aclgraph_sizes(test_vllm_config)
|
||||||
os.environ['HCCL_OP_EXPANSION_MODE'] = 'AIV'
|
os.environ['HCCL_OP_EXPANSION_MODE'] = 'AIV'
|
||||||
utils.update_aclgraph_sizes(test_vllm_config)
|
utils.update_aclgraph_sizes(test_vllm_config)
|
||||||
|
|||||||
@@ -235,8 +235,6 @@ def test_torchair_deepseek_v2_mlp(mock_distributed, base_config):
|
|||||||
hidden_act="silu",
|
hidden_act="silu",
|
||||||
quant_config=None)
|
quant_config=None)
|
||||||
assert isinstance(mlp.act_fn, TorchairDeepseekV2SiluAndMul)
|
assert isinstance(mlp.act_fn, TorchairDeepseekV2SiluAndMul)
|
||||||
ascend_config = MagicMock()
|
|
||||||
ascend_config._ASCEND_CONFIG.ascend_scheduler_config.enabled = False
|
|
||||||
with patch(
|
with patch(
|
||||||
"vllm_ascend.torchair.models.torchair_deepseek_v2.QuantizationConfig"
|
"vllm_ascend.torchair.models.torchair_deepseek_v2.QuantizationConfig"
|
||||||
) as mock_quant_config:
|
) as mock_quant_config:
|
||||||
|
|||||||
Reference in New Issue
Block a user