From f96100fad51b9b5eb6675e42a464115d560037c0 Mon Sep 17 00:00:00 2001
From: Shanshan Shen <467638484@qq.com>
Date: Tue, 15 Jul 2025 19:58:55 +0800
Subject: [PATCH] [Misc][V0 Deprecation] Remove V0 related codes of test, example, platform (#1805)

### What this PR does / why we need it?
Remove V0-related code from the tests, examples, and platform module.

This PR is part of https://github.com/vllm-project/vllm-ascend/issues/1620.

- vLLM version: v0.9.2
- vLLM main: https://github.com/vllm-project/vllm/commit/235bfd5dfe0975e42b115cfb910e73eff5c670d8

---------

Signed-off-by: shen-shanshan <467638484@qq.com>
---
 examples/offline_inference_npu_v0.py         |  44 ---
 tests/e2e/singlecard/test_guided_decoding.py |  14 +-
 tests/ut/test_platform.py                    |  39 --
 tests/ut/worker/test_pooling_model_runner.py | 355 -------
 vllm_ascend/platform.py                      |  18 +-
 5 files changed, 10 insertions(+), 460 deletions(-)
 delete mode 100644 examples/offline_inference_npu_v0.py
 delete mode 100644 tests/ut/worker/test_pooling_model_runner.py

diff --git a/examples/offline_inference_npu_v0.py b/examples/offline_inference_npu_v0.py
deleted file mode 100644
index b6a1156..0000000
--- a/examples/offline_inference_npu_v0.py
+++ /dev/null
@@ -1,44 +0,0 @@
-#
-# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
-# Copyright 2023 The vLLM team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# This file is a part of the vllm-ascend project.
-# Adapted from vllm-project/vllm/examples/offline_inference/basic.py
-#
-
-import os
-
-os.environ["VLLM_USE_V1"] = "0"
-os.environ["VLLM_USE_MODELSCOPE"] = "True"
-
-from vllm import LLM, SamplingParams
-
-prompts = [
-    "Hello, my name is",
-    "The president of the United States is",
-    "The capital of France is",
-    "The future of AI is",
-]
-
-# Create a sampling params object.
-sampling_params = SamplingParams(max_tokens=100, temperature=0.0)
-# Create an LLM.
-llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct")
-
-# Generate texts from the prompts.
-outputs = llm.generate(prompts, sampling_params) -for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") diff --git a/tests/e2e/singlecard/test_guided_decoding.py b/tests/e2e/singlecard/test_guided_decoding.py index 20c03a5..04587d0 100644 --- a/tests/e2e/singlecard/test_guided_decoding.py +++ b/tests/e2e/singlecard/test_guided_decoding.py @@ -30,10 +30,7 @@ from tests.e2e.conftest import VllmRunner os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256" MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct" -GuidedDecodingBackendV0 = ["outlines", "lm-format-enforcer", "xgrammar"] -GuidedDecodingBackendV1 = ["xgrammar", "guidance"] -GuidedDecodingBackend = list( - set(GuidedDecodingBackendV0 + GuidedDecodingBackendV1)) +GuidedDecodingBackend = ["xgrammar", "guidance"] @pytest.fixture(scope="module") @@ -84,16 +81,9 @@ def sample_json_schema(): } -def check_backend(guided_decoding_backend: str): - if guided_decoding_backend not in GuidedDecodingBackendV1: - pytest.skip(f"{guided_decoding_backend} does not support v1, skip it.") - - @pytest.mark.parametrize("guided_decoding_backend", GuidedDecodingBackend) def test_guided_json_completion(guided_decoding_backend: str, sample_json_schema): - check_backend(guided_decoding_backend) - sampling_params = SamplingParams( temperature=1.0, max_tokens=500, @@ -130,8 +120,6 @@ def test_guided_json_completion(guided_decoding_backend: str, @pytest.mark.parametrize("guided_decoding_backend", GuidedDecodingBackend) def test_guided_regex(guided_decoding_backend: str, sample_regex): - check_backend(guided_decoding_backend) - sampling_params = SamplingParams( temperature=0.8, top_p=0.95, diff --git a/tests/ut/test_platform.py b/tests/ut/test_platform.py index b6bee95..f7dc40e 100644 --- a/tests/ut/test_platform.py +++ b/tests/ut/test_platform.py @@ -481,45 +481,6 @@ class TestNPUPlatform(TestBase): result, "vllm_ascend.attention.attention_v1.AscendAttentionBackend") - @patch('vllm_ascend.platform.get_ascend_config') - def test_get_attn_backend_cls_use_mla_only(self, mock_get_ascend_config): - mock_config = MagicMock() - mock_config.torchair_graph_config.enabled = False - - mock_get_ascend_config.return_value = mock_config - - result = self.platform.get_attn_backend_cls( - selected_backend="ascend", - head_size=64, - dtype="float16", - kv_cache_dtype="float16", - block_size=64, - use_v1=False, - use_mla=True, - ) - self.assertEqual( - result, - "vllm_ascend.attention.attention.AscendMLAAttentionBackend") - - @patch('vllm_ascend.platform.get_ascend_config') - def test_get_attn_backend_cls_default_case(self, mock_get_ascend_config): - mock_config = MagicMock() - mock_config.torchair_graph_config.enabled = False - - mock_get_ascend_config.return_value = mock_config - - result = self.platform.get_attn_backend_cls( - selected_backend="ascend", - head_size=64, - dtype="float16", - kv_cache_dtype="float16", - block_size=64, - use_v1=False, - use_mla=False, - ) - self.assertEqual( - result, "vllm_ascend.attention.attention.AscendAttentionBackend") - def test_get_punica_wrapper(self): result = self.platform.get_punica_wrapper() self.assertEqual( diff --git a/tests/ut/worker/test_pooling_model_runner.py b/tests/ut/worker/test_pooling_model_runner.py deleted file mode 100644 index 28a0a7d..0000000 --- a/tests/ut/worker/test_pooling_model_runner.py +++ /dev/null @@ -1,355 +0,0 @@ -import unittest -from unittest.mock import MagicMock, patch - -import torch -from 
vllm.distributed.parallel_state import GroupCoordinator -from vllm.engine.arg_utils import EngineArgs -from vllm.pooling_params import PoolingParams -from vllm.sequence import SequenceData, SequenceGroupMetadata - -from vllm_ascend.worker.pooling_model_runner import ( - ModelInputForNPUWithPoolingMetadata, NPUPoolingModelRunner) - - -class TestPoolingModelRunner(unittest.TestCase): - """Unit tests for the NPUPoolingModelRunner class.""" - - def _create_model_runner(self, model: str, *args, - **kwargs) -> NPUPoolingModelRunner: - engine_args = EngineArgs(model, *args, **kwargs) - engine_config = engine_args.create_engine_config() - model_runner = NPUPoolingModelRunner(vllm_config=engine_config, ) - return model_runner - - def setUp(self): - """Initialize test fixtures and common mocks""" - self.attn_backend = "npu" - - model_runner = self._create_model_runner( - "tests/ut/fake_weight", - trust_remote_code=True, - enable_chunked_prefill=False, - ) - - self.runner = model_runner - self.runner.attn_backend = self.attn_backend - model_runner.model = MagicMock() - self.runner = model_runner - # Sample test data - self.sample_tensor_dict = {"tensor1": torch.randn(3, 4)} - self.sample_seq_group = [MagicMock(spec=SequenceGroupMetadata)] - self.sample_finished_ids = ["req1", "req2"] - - @patch( - 'vllm_ascend.worker.pooling_model_runner.ModelInputForNPUWithPoolingMetadata.from_broadcasted_tensor_dict' - ) - def test_make_model_input_from_broadcasted_tensor_dict( - self, mock_from_dict): - """Test tensor dictionary conversion to model input""" - # Setup mock return - expected_output = MagicMock() - mock_from_dict.return_value = expected_output - - # Execute - result = self.runner.make_model_input_from_broadcasted_tensor_dict( - self.sample_tensor_dict) - - # Verify - mock_from_dict.assert_called_once_with(self.sample_tensor_dict, - attn_backend=self.attn_backend) - self.assertEqual(result, expected_output) - - @patch.object(NPUPoolingModelRunner, '_prepare_pooling') - @patch.object(NPUPoolingModelRunner, '_prepare_model_input_tensors') - def test_prepare_model_input_normal_case(self, mock_prepare_tensors, - mock_prepare_pooling): - """Test normal flow of model input preparation""" - # Setup mocks - mock_model_input = ModelInputForNPUWithPoolingMetadata( - seq_lens=[1, 2, 3]) - mock_prepare_tensors.return_value = mock_model_input - - mock_pooling_metadata = MagicMock() - mock_prepare_pooling.return_value = mock_pooling_metadata - - # Execute - result = self.runner.prepare_model_input( - seq_group_metadata_list=self.sample_seq_group, - finished_requests_ids=self.sample_finished_ids) - - # Verify - mock_prepare_tensors.assert_called_once_with(self.sample_seq_group, - self.sample_finished_ids) - mock_prepare_pooling.assert_called_once_with(self.sample_seq_group, - mock_model_input.seq_lens) - self.assertEqual(result.pooling_metadata, mock_pooling_metadata) - - def test_prepare_model_input_null_sequence_group(self): - """Test assertion when seq_group_metadata_list is None""" - with self.assertRaises(AssertionError): - self.runner.prepare_model_input( - seq_group_metadata_list=None, - finished_requests_ids=self.sample_finished_ids) - - @patch.object(NPUPoolingModelRunner, '_prepare_model_input_tensors') - def test_prepare_model_input_null_seq_lens(self, mock_prepare_tensors): - """Test assertion when seq_lens is None in model input""" - # Setup mock with None seq_lens - mock_model_input = MagicMock() - mock_model_input.seq_lens = None - mock_prepare_tensors.return_value = mock_model_input - - with 
self.assertRaises(AssertionError): - self.runner.prepare_model_input( - seq_group_metadata_list=self.sample_seq_group, - finished_requests_ids=self.sample_finished_ids) - - @patch.object(NPUPoolingModelRunner, '_prepare_pooling') - @patch.object(NPUPoolingModelRunner, '_prepare_model_input_tensors') - def test_prepare_model_input_with_virtual_engine(self, - mock_prepare_tensors, - mock_prepare_pooling): - """Test virtual engine parameter is properly handled""" - # Setup mocks - mock_model_input = ModelInputForNPUWithPoolingMetadata( - seq_lens=[1, 2, 3]) - mock_prepare_tensors.return_value = mock_model_input - - # Execute with virtual_engine parameter - result = self.runner.prepare_model_input( - seq_group_metadata_list=self.sample_seq_group, - virtual_engine=1, - finished_requests_ids=self.sample_finished_ids) - - # Verify virtual_engine doesn't affect the flow - self.assertIsNotNone(result) - - @patch.object(NPUPoolingModelRunner, '_prepare_pooling') - @patch.object(NPUPoolingModelRunner, '_prepare_model_input_tensors') - def test_prepare_model_input_with_null_finished_ids( - self, mock_prepare_tensors, mock_prepare_pooling): - """Test case when finished_requests_ids is None""" - # Setup mocks - mock_model_input = ModelInputForNPUWithPoolingMetadata( - seq_lens=[1, 2, 3]) - mock_prepare_tensors.return_value = mock_model_input - - # Execute with None finished_ids - result = self.runner.prepare_model_input( - seq_group_metadata_list=self.sample_seq_group, - finished_requests_ids=None) - - # Verify - mock_prepare_tensors.assert_called_once_with(self.sample_seq_group, - None) - self.assertIsNotNone(result) - - @patch('vllm.model_executor.pooling_metadata.PoolingMetadata.__init__') - def test_prepare_pooling_normal_case(self, mock_pooling_metadata): - """Test normal case with multiple sequences in group""" - # Setup test data - mock_pooling_metadata.return_value = None - seq_data = { - 1: MagicMock(spec=SequenceData), - 2: MagicMock(spec=SequenceData) - } - pooling_params = MagicMock(spec=PoolingParams) - seq_group = MagicMock(spec=SequenceGroupMetadata) - seq_group.seq_data = seq_data - seq_group.pooling_params = pooling_params - - # Call the function - self.runner._prepare_pooling([seq_group], [10, 20]) - - # Verify results - mock_pooling_metadata.assert_called_once_with(seq_groups=[ - ([1, 2], pooling_params) - ], - seq_data=seq_data, - prompt_lens=[10, 20]) - - @patch('vllm.model_executor.pooling_metadata.PoolingMetadata.__init__') - def test_prepare_pooling_empty_group(self, mock_pooling_metadata): - """Test case with empty sequence group""" - # Setup empty group - mock_pooling_metadata.return_value = None - empty_seq_data: dict[int, SequenceData] = {} - pooling_params = MagicMock(spec=PoolingParams) - empty_group = MagicMock(spec=SequenceGroupMetadata) - empty_group.seq_data = empty_seq_data - empty_group.pooling_params = pooling_params - - # Call the function - self.runner._prepare_pooling([empty_group], []) - - # Verify results - mock_pooling_metadata.assert_called_once_with(seq_groups=[ - ([], pooling_params) - ], - seq_data={}, - prompt_lens=[]) - - @patch('vllm.model_executor.pooling_metadata.PoolingMetadata.__init__') - def test_prepare_pooling_single_sequence(self, mock_pooling_metadata): - """Test case with single sequence in group""" - # Setup single sequence - mock_pooling_metadata.return_value = None - single_seq_data = {3: MagicMock(spec=SequenceData)} - pooling_params = MagicMock(spec=PoolingParams) - single_group = MagicMock(spec=SequenceGroupMetadata) - 
single_group.seq_data = single_seq_data - single_group.pooling_params = pooling_params - - # Call the function - self.runner._prepare_pooling([single_group], [5]) - - # Verify results - mock_pooling_metadata.assert_called_once_with(seq_groups=[ - ([3], pooling_params) - ], - seq_data=single_seq_data, - prompt_lens=[5]) - - @patch('vllm.model_executor.pooling_metadata.PoolingMetadata.__init__') - def test_prepare_pooling_multiple_groups(self, mock_pooling_metadata): - """Test case with multiple sequence groups""" - # Setup multiple groups - mock_pooling_metadata.return_value = None - seq_data1 = {1: MagicMock(spec=SequenceData)} - seq_data2 = {2: MagicMock(spec=SequenceData)} - params1 = MagicMock(spec=PoolingParams) - params2 = MagicMock(spec=PoolingParams) - - group1 = MagicMock(spec=SequenceGroupMetadata) - group1.seq_data = seq_data1 - group1.pooling_params = params1 - - group2 = MagicMock(spec=SequenceGroupMetadata) - group2.seq_data = seq_data2 - group2.pooling_params = params2 - - # Call the function - self.runner._prepare_pooling([group1, group2], [10, 20]) - - # Verify results - mock_pooling_metadata.assert_called_once_with(seq_groups=[ - ([1], params1), ([2], params2) - ], - seq_data={ - **seq_data1, - **seq_data2 - }, - prompt_lens=[10, 20]) - - @patch('vllm.model_executor.pooling_metadata.PoolingMetadata.__init__') - def test_prepare_pooling_empty_input(self, mock_pooling_metadata): - """Test case with empty input lists""" - # Call the function with empty inputs - mock_pooling_metadata.return_value = None - self.runner._prepare_pooling([], []) - - # Verify results - mock_pooling_metadata.assert_called_once_with(seq_groups=[], - seq_data={}, - prompt_lens=[]) - - @patch('vllm.forward_context.set_forward_context') - @patch('vllm.distributed.parallel_state._PP', - new_callable=lambda: MagicMock(spec=GroupCoordinator, - is_last_rank=True)) - @patch('torch.npu.Event') - @patch.object(NPUPoolingModelRunner, 'set_active_loras') - @patch.object(NPUPoolingModelRunner, 'set_active_prompt_adapters') - def test_execute_model_normal_flow(self, mock_set_adapters, mock_set_loras, - mock_event, mock_pp, mock_set_forward): - """Test normal execution path with all dependencies mocked""" - - # Setup model input mock - mock_input = MagicMock() - mock_input.input_tokens = torch.tensor([1]) - mock_input.input_positions = torch.tensor([0]) - mock_input.multi_modal_kwargs = {} - self.runner.is_driver_worker = True - # Execute - self.runner.execute_model(model_input=mock_input, - kv_caches=[], - num_steps=1) - - # Verify core calls - self.runner.model.pooler.assert_called_once() - - @patch('vllm.forward_context.set_forward_context') - def test_execute_model_invalid_steps(self, mock_set_forward): - """Test ValueError when num_steps != 1""" - with self.assertRaises(ValueError): - self.runner.execute_model(model_input=MagicMock(), - kv_caches=[], - num_steps=2) - mock_set_forward.assert_not_called() - - @patch('vllm.forward_context.set_forward_context') - @patch('vllm.distributed.parallel_state._PP', - new_callable=lambda: MagicMock(spec=GroupCoordinator, - is_last_rank=False)) - @patch('torch.npu.Event') - def test_execute_model_perf_monitoring(self, mock_event, mock_pp, - mock_set_forward): - """Test performance monitoring with timing mocks""" - # Setup mocks - - mock_event.return_value.elapsed_time.return_value = 15.0 - self.runner.observability_config = MagicMock( - collect_model_forward_time=True) - - # Execute - self.runner.execute_model(model_input=MagicMock( - input_tokens=torch.tensor([1]), - 
input_positions=torch.tensor([0]), - multi_modal_kwargs={}), - kv_caches=[], - num_steps=1) - - # Verify timing calls - self.assertEqual(mock_event.call_count, 2) - - @patch('vllm.forward_context.set_forward_context') - @patch.object(NPUPoolingModelRunner, 'set_active_loras') - @patch('vllm.distributed.parallel_state._PP', - new_callable=lambda: MagicMock(spec=GroupCoordinator, - is_last_rank=False)) - def test_execute_model_lora_config(self, mock_pp, set_active_loras, - mock_set_forward): - """Test LoRA configuration handling""" - # Setup - - self.runner.lora_config = True - mock_input = MagicMock() - mock_input.lora_requests = ["req1"] - mock_input.lora_mapping = {"map": 1} - - # Execute - self.runner.execute_model(model_input=mock_input, - kv_caches=[], - num_steps=1) - - # Verify LoRA call - set_active_loras.assert_called_once_with(["req1"], {"map": 1}) - - @patch('vllm.forward_context.set_forward_context') - @patch('vllm.distributed.parallel_state._PP', - new_callable=lambda: MagicMock(spec=GroupCoordinator, - is_last_rank=False)) - def test_execute_model_not_last_rank(self, mock_pp, mock_set_forward): - """Test behavior when not the last pipeline rank""" - # Setup - - # Execute - self.runner.execute_model(model_input=MagicMock( - input_tokens=torch.tensor([1]), - input_positions=torch.tensor([0]), - multi_modal_kwargs={}), - kv_caches=[], - num_steps=1) - - # Verify pooler not called - self.runner.model.pooler.assert_not_called() diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py index fede975..f13ed49 100644 --- a/vllm_ascend/platform.py +++ b/vllm_ascend/platform.py @@ -117,7 +117,7 @@ class NPUPlatform(Platform): @classmethod def check_and_update_config(cls, vllm_config: VllmConfig) -> None: if not envs.VLLM_USE_V1: - raise ValueError("vLLM Ascend does not support V0 engine") + raise ValueError("vLLM Ascend does not support V0 engine.") # initialize ascend config from vllm additional_config ascend_config = init_ascend_config(vllm_config) @@ -208,16 +208,16 @@ class NPUPlatform(Platform): @classmethod def get_attn_backend_cls(cls, selected_backend, head_size, dtype, kv_cache_dtype, block_size, use_v1, use_mla): - if use_v1 and use_mla: - return "vllm_ascend.attention.mla_v1.AscendMLABackend" + if not use_v1: + raise ValueError("vLLM Ascend does not support V0 engine.") + use_torchair = get_ascend_config().torchair_graph_config.enabled - if use_v1 and use_torchair: - return "vllm_ascend.attention.attention_v1_torchair.AscendAttentionTorchairBackend" - if use_v1: - return "vllm_ascend.attention.attention_v1.AscendAttentionBackend" if use_mla: - return "vllm_ascend.attention.attention.AscendMLAAttentionBackend" - return "vllm_ascend.attention.attention.AscendAttentionBackend" + return "vllm_ascend.attention.mla_v1.AscendMLABackend" + elif use_torchair: + return "vllm_ascend.attention.attention_v1_torchair.AscendAttentionTorchairBackend" + else: + return "vllm_ascend.attention.attention_v1.AscendAttentionBackend" @classmethod def get_punica_wrapper(cls) -> str:
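
---

Editor's note: since this patch deletes `examples/offline_inference_npu_v0.py` without adding a V1 counterpart, here is a minimal sketch of the equivalent V1-era script, assuming vLLM selects the V1 engine by default so the `VLLM_USE_V1` override is simply dropped. Model, prompts, and sampling settings are carried over from the removed file.

```python
import os

# Pull the model from ModelScope, as the removed V0 example did.
os.environ["VLLM_USE_MODELSCOPE"] = "True"

from vllm import LLM, SamplingParams

prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]

# Greedy decoding with the same settings the deleted script used.
sampling_params = SamplingParams(max_tokens=100, temperature=0.0)

# No VLLM_USE_V1 toggle: with this patch, V0 raises
# "vLLM Ascend does not support V0 engine.", so the default (V1)
# engine is used as-is.
llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct")

outputs = llm.generate(prompts, sampling_params)
for output in outputs:
    print(f"Prompt: {output.prompt!r}, "
          f"Generated text: {output.outputs[0].text!r}")
```

Likewise, the guided-decoding test now exercises only the V1 backends (`xgrammar`, `guidance`). A hedged usage sketch, assuming the `GuidedDecodingParams` API shipped with vLLM v0.9.2 (the regex and prompt here are illustrative, not taken from the test):

```python
from vllm import LLM, SamplingParams
from vllm.sampling_params import GuidedDecodingParams

# Constrain the output to a phone-number-like regex; "xgrammar" is one
# of the two backends the updated test keeps (the other is "guidance").
params = SamplingParams(
    temperature=0.8,
    top_p=0.95,
    guided_decoding=GuidedDecodingParams(regex=r"\d{3}-\d{4}"),
)

llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct",
          guided_decoding_backend="xgrammar")
print(llm.generate(["My phone number is "], params)[0].outputs[0].text)
```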