diff --git a/.github/workflows/_e2e_test.yaml b/.github/workflows/_e2e_test.yaml
index 217d632c..4f5e6bc7 100644
--- a/.github/workflows/_e2e_test.yaml
+++ b/.github/workflows/_e2e_test.yaml
@@ -91,7 +91,6 @@ jobs:
           pytest -sv tests/e2e/singlecard/test_completion_with_prompt_embeds.py
           pytest -sv tests/e2e/singlecard/test_aclgraph.py
           pytest -sv tests/e2e/singlecard/test_aclgraph_mem.py
-          pytest -sv tests/e2e/singlecard/test_ascend_scheduler.py
           pytest -sv tests/e2e/singlecard/test_bge_model.py
           pytest -sv tests/e2e/singlecard/test_camem.py
           pytest -sv tests/e2e/singlecard/test_embedding.py
diff --git a/tests/e2e/singlecard/test_ascend_scheduler.py b/tests/e2e/singlecard/test_ascend_scheduler.py
deleted file mode 100644
index 0c996e4e..00000000
--- a/tests/e2e/singlecard/test_ascend_scheduler.py
+++ /dev/null
@@ -1,118 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import pytest
-from vllm import SamplingParams
-
-from tests.e2e.conftest import VllmRunner
-from tests.e2e.model_utils import check_outputs_equal
-
-MODEL = "Qwen/Qwen3-0.6B"
-
-
-@pytest.mark.parametrize("enforce_eager", [True, False])
-def test_concurrent_partial_prefill(enforce_eager):
-    with VllmRunner(MODEL,
-                    max_num_seqs=3,
-                    max_num_batched_tokens=8192,
-                    enforce_eager=enforce_eager,
-                    gpu_memory_utilization=0.7) as vllm_model:
-        outputs = vllm_model.model.generate(["Hello my name is Robert and I"] *
-                                            3)
-        assert len(outputs) == 3
-        for output in outputs:
-            assert len(output.outputs) == 1
-
-
-@pytest.mark.parametrize("enforce_eager", [True, False])
-def test_prefix_cache_stats_is_recorded(enforce_eager):
-    with VllmRunner(MODEL,
-                    max_num_seqs=3,
-                    max_num_batched_tokens=8192,
-                    enforce_eager=enforce_eager,
-                    gpu_memory_utilization=0.7) as vllm_model:
-        # 129 tokens will make sure the first 128 tokens are cached in a block
-        input_tokens = {"prompt_token_ids": [101] * 129}
-        _ = vllm_model.model.generate([input_tokens])
-        outputs = vllm_model.model.generate([input_tokens])
-        assert outputs[0].num_cached_tokens == 128
-
-
-@pytest.mark.parametrize("max_tokens",
-                         [4])  # cannot align results when max_tokens > 4
-@pytest.mark.parametrize("chunked_prefill_token_size", [2048])
-def test_chunked_prefill_with_scheduler_dynamic_batch(
-        max_tokens: int, chunked_prefill_token_size: int) -> None:
-    example_prompts = [
-        "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs."
-    ]
-    max_num_seqs = chunked_prefill_token_size
-    max_num_batched_tokens = chunked_prefill_token_size
-    with VllmRunner(MODEL,
-                    additional_config={
-                        'SLO_limits_for_dynamic_batch': 0,
-                    },
-                    max_num_seqs=max_num_seqs,
-                    max_num_batched_tokens=max_num_batched_tokens,
-                    max_model_len=2048,
-                    gpu_memory_utilization=0.7) as vllm_model:
-        dynamic_batch_output = vllm_model.generate_greedy(
-            example_prompts, max_tokens)
-
-    with VllmRunner(MODEL,
-                    additional_config={
-                        'SLO_limits_for_dynamic_batch': -1,
-                    },
-                    max_model_len=2048,
-                    gpu_memory_utilization=0.7) as vllm_model:
-        vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)
-
-    check_outputs_equal(
-        outputs_0_lst=vllm_output,
-        outputs_1_lst=dynamic_batch_output,
-        name_0="vllm_output",
-        name_1="chunked_prefill_output",
-    )
-
-
-def test_async_scheduling_eager() -> None:
-    prompts = [
-        "Hello, my name is",
-        "The president of the United States is",
-        "The capital of France is",
-        "The future of AI is",
-    ] * 10
-    sampling_params = SamplingParams(temperature=0.2,
-                                     max_tokens=10,
-                                     stop_token_ids=None)
-
-    with VllmRunner(
-            "Qwen/Qwen2.5-0.5B-Instruct",
-            max_model_len=4096,
-            max_num_seqs=50,
-            dtype="bfloat16",
-            gpu_memory_utilization=0.9,
-            async_scheduling=True,
-    ) as vllm_model:
-        vllm_model.generate(prompts, sampling_params=sampling_params)
-
-
-def test_async_scheduling_with_full_graph() -> None:
-    prompts = [
-        "Hello, my name is",
-        "The president of the United States is",
-        "The capital of France is",
-        "The future of AI is",
-    ] * 10
-    sampling_params = SamplingParams(temperature=0.2,
-                                     max_tokens=10,
-                                     stop_token_ids=None)
-
-    with VllmRunner("Qwen/Qwen3-8B",
-                    max_model_len=4096,
-                    max_num_seqs=50,
-                    dtype="bfloat16",
-                    gpu_memory_utilization=0.9,
-                    async_scheduling=True,
-                    compilation_config={"cudagraph_mode":
-                                        "FULL"}) as vllm_model:
-        vllm_model.generate(prompts, sampling_params=sampling_params)
diff --git a/tests/ut/quantization/test_w8a8_dynamic.py b/tests/ut/quantization/test_w8a8_dynamic.py
index f25192c2..76d510dd 100644
--- a/tests/ut/quantization/test_w8a8_dynamic.py
+++ b/tests/ut/quantization/test_w8a8_dynamic.py
@@ -33,13 +33,6 @@ class TestAscendW8A8FusedMoEMethod(TestBase):
         mock_get_ep_group.return_value = mock_ep_group
 
         mock_ascend_config = Mock()
-        # Create a Mock object with concrete attributes to represent ascend_scheduler_config
-        mock_ascend_scheduler_config = Mock()
-        mock_ascend_scheduler_config.enabled = False
-        mock_ascend_scheduler_config.max_num_batched_tokens = 1024
-        mock_ascend_scheduler_config.max_model_len = 2048
-        mock_ascend_config.ascend_scheduler_config = mock_ascend_scheduler_config
-
         mock_ascend_config.torchair_graph_config = Mock(enabled=False)
         mock_ascend_config.enable_chunked_prefill = False
         mock_get_ascend_config.return_value = mock_ascend_config
diff --git a/tests/ut/test_ascend_config.py b/tests/ut/test_ascend_config.py
index 718bc85f..be066179 100644
--- a/tests/ut/test_ascend_config.py
+++ b/tests/ut/test_ascend_config.py
@@ -56,9 +56,6 @@ class TestAscendConfig(TestBase):
             self.assertTrue(torchair_graph_config.enable_frozen_parameter)
             self.assertFalse(torchair_graph_config.enable_kv_nz)
 
-            ascend_scheduler_config = ascend_config.ascend_scheduler_config
-            self.assertFalse(ascend_scheduler_config.enabled)
-
     @_clean_up_ascend_config
     def test_init_ascend_config_with_additional_config(self):
         test_vllm_config = VllmConfig()
@@ -74,9 +71,6 @@ class TestAscendConfig(TestBase):
                 "enable_kv_nz": True
             },
             "multistream_overlap_shared_expert": True,
-            "ascend_scheduler_config": {
-                "enabled": True
-            },
"expert_map_path": "test_expert_map_path", "refresh": True, } @@ -94,9 +88,6 @@ class TestAscendConfig(TestBase): self.assertTrue(torchair_graph_config.enable_frozen_parameter) self.assertTrue(torchair_graph_config.enable_kv_nz) - ascend_scheduler_config = ascend_config.ascend_scheduler_config - self.assertTrue(ascend_scheduler_config.enabled) - @_clean_up_ascend_config def test_init_ascend_config_with_refresh(self): test_vllm_config = VllmConfig() diff --git a/tests/ut/test_platform.py b/tests/ut/test_platform.py index 5fe5cde3..5dedff7f 100644 --- a/tests/ut/test_platform.py +++ b/tests/ut/test_platform.py @@ -522,31 +522,6 @@ class TestNPUPlatform(TestBase): self.platform.check_and_update_config(vllm_config) self.assertEqual(vllm_config.compilation_config.custom_ops, []) - @patch('vllm_ascend.utils.get_ascend_device_type', - return_value=AscendDeviceType._910_93) - @patch("vllm_ascend.ascend_config.check_ascend_config") - @patch("vllm_ascend.ascend_config.init_ascend_config") - @patch( - "vllm_ascend.core.recompute_schedule_config.RecomputeSchedulerConfig.initialize_from_config" - ) - def test_check_and_update_config_ascend_scheduler_config( - self, mock_init_recompute, mock_init_ascend, mock_check_ascend, - mock_soc_version): - mock_ascend_config = TestNPUPlatform.mock_vllm_ascend_config() - mock_ascend_config.ascend_scheduler_config.enabled = True - mock_init_ascend.return_value = mock_ascend_config - vllm_config = TestNPUPlatform.mock_vllm_config() - vllm_config.parallel_config.tensor_parallel_size = 1 - mock_init_recompute.return_value = MagicMock() - - with patch("vllm_ascend.core.schedule_config.AscendSchedulerConfig" - ) as mock_scheduler: - from vllm_ascend import platform - - importlib.reload(platform) - self.platform.check_and_update_config(vllm_config) - mock_scheduler.initialize_from_config.assert_called_once() - @patch('vllm_ascend.platform.get_ascend_config') def test_get_attn_backend_cls_use_v1_and_mla(self, mock_get_ascend_config): mock_config = MagicMock() diff --git a/tests/ut/test_utils.py b/tests/ut/test_utils.py index 29ed7b44..8ff1419e 100644 --- a/tests/ut/test_utils.py +++ b/tests/ut/test_utils.py @@ -253,12 +253,10 @@ class TestUtils(TestBase): model_path = os.path.join(os.path.dirname(__file__), "fake_weight") test_model_config = ModelConfig(model=model_path, enforce_eager=True) test_parallel_config = ParallelConfig() - ascend_config = {"ascend_scheduler_config": {"enabled": False}} test_vllm_config = VllmConfig( model_config=test_model_config, compilation_config=test_compilation_config, - parallel_config=test_parallel_config, - additional_config=ascend_config) + parallel_config=test_parallel_config) utils.update_aclgraph_sizes(test_vllm_config) os.environ['HCCL_OP_EXPANSION_MODE'] = 'AIV' utils.update_aclgraph_sizes(test_vllm_config) diff --git a/tests/ut/torchair/models/test_torchair_deepseek_v2.py b/tests/ut/torchair/models/test_torchair_deepseek_v2.py index 35e1bb99..e1a5625b 100644 --- a/tests/ut/torchair/models/test_torchair_deepseek_v2.py +++ b/tests/ut/torchair/models/test_torchair_deepseek_v2.py @@ -235,8 +235,6 @@ def test_torchair_deepseek_v2_mlp(mock_distributed, base_config): hidden_act="silu", quant_config=None) assert isinstance(mlp.act_fn, TorchairDeepseekV2SiluAndMul) - ascend_config = MagicMock() - ascend_config._ASCEND_CONFIG.ascend_scheduler_config.enabled = False with patch( "vllm_ascend.torchair.models.torchair_deepseek_v2.QuantizationConfig" ) as mock_quant_config: