[Misc][V0 Deprecation] Remove V0-related test, example, and platform code (#1805)

### What this PR does / why we need it?
Remove V0-related test, example, and platform code.

This PR is part of https://github.com/vllm-project/vllm-ascend/issues/1620.

- vLLM version: v0.9.2
- vLLM main: 235bfd5dfe

---------

Signed-off-by: shen-shanshan <467638484@qq.com>
Authored by Shanshan Shen on 2025-07-15 19:58:55 +08:00; committed by GitHub
parent a929699e98, commit f96100fad5
5 changed files with 10 additions and 460 deletions

View File

@@ -1,44 +0,0 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
# Adapted from vllm-project/vllm/examples/offline_inference/basic.py
#
import os

os.environ["VLLM_USE_V1"] = "0"
os.environ["VLLM_USE_MODELSCOPE"] = "True"

from vllm import LLM, SamplingParams

prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]

# Create a sampling params object.
sampling_params = SamplingParams(max_tokens=100, temperature=0.0)
# Create an LLM.
llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct")
# Generate texts from the prompts.
outputs = llm.generate(prompts, sampling_params)

for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

View File

@@ -30,10 +30,7 @@ from tests.e2e.conftest import VllmRunner
os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"
MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"
GuidedDecodingBackendV0 = ["outlines", "lm-format-enforcer", "xgrammar"]
GuidedDecodingBackendV1 = ["xgrammar", "guidance"]
GuidedDecodingBackend = list(
set(GuidedDecodingBackendV0 + GuidedDecodingBackendV1))
GuidedDecodingBackend = ["xgrammar", "guidance"]
@pytest.fixture(scope="module")
@@ -84,16 +81,9 @@ def sample_json_schema():
}
def check_backend(guided_decoding_backend: str):
if guided_decoding_backend not in GuidedDecodingBackendV1:
pytest.skip(f"{guided_decoding_backend} does not support v1, skip it.")
@pytest.mark.parametrize("guided_decoding_backend", GuidedDecodingBackend)
def test_guided_json_completion(guided_decoding_backend: str,
sample_json_schema):
check_backend(guided_decoding_backend)
sampling_params = SamplingParams(
temperature=1.0,
max_tokens=500,
@@ -130,8 +120,6 @@ def test_guided_json_completion(guided_decoding_backend: str,
@pytest.mark.parametrize("guided_decoding_backend", GuidedDecodingBackend)
def test_guided_regex(guided_decoding_backend: str, sample_regex):
check_backend(guided_decoding_backend)
sampling_params = SamplingParams(
temperature=0.8,
top_p=0.95,
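
The surviving backends are driven through vLLM's standard guided-decoding API; a hedged usage sketch, assuming the `GuidedDecodingParams` interface of vLLM v0.9.x (model and regex are illustrative):

```python
from vllm import LLM, SamplingParams
from vllm.sampling_params import GuidedDecodingParams

# Pick one of the V1 backends kept by this PR: "xgrammar" or "guidance".
llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct",
          guided_decoding_backend="xgrammar")

# Constrain generation to a phone-number-like pattern.
sampling_params = SamplingParams(
    temperature=0.8,
    max_tokens=50,
    guided_decoding=GuidedDecodingParams(regex=r"\d{3}-\d{4}"),
)

outputs = llm.generate(["My phone number is "], sampling_params)
print(outputs[0].outputs[0].text)
```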

View File

@@ -481,45 +481,6 @@ class TestNPUPlatform(TestBase):
             result,
             "vllm_ascend.attention.attention_v1.AscendAttentionBackend")

-    @patch('vllm_ascend.platform.get_ascend_config')
-    def test_get_attn_backend_cls_use_mla_only(self, mock_get_ascend_config):
-        mock_config = MagicMock()
-        mock_config.torchair_graph_config.enabled = False
-        mock_get_ascend_config.return_value = mock_config
-        result = self.platform.get_attn_backend_cls(
-            selected_backend="ascend",
-            head_size=64,
-            dtype="float16",
-            kv_cache_dtype="float16",
-            block_size=64,
-            use_v1=False,
-            use_mla=True,
-        )
-        self.assertEqual(
-            result,
-            "vllm_ascend.attention.attention.AscendMLAAttentionBackend")
-
-    @patch('vllm_ascend.platform.get_ascend_config')
-    def test_get_attn_backend_cls_default_case(self, mock_get_ascend_config):
-        mock_config = MagicMock()
-        mock_config.torchair_graph_config.enabled = False
-        mock_get_ascend_config.return_value = mock_config
-        result = self.platform.get_attn_backend_cls(
-            selected_backend="ascend",
-            head_size=64,
-            dtype="float16",
-            kv_cache_dtype="float16",
-            block_size=64,
-            use_v1=False,
-            use_mla=False,
-        )
-        self.assertEqual(
-            result, "vllm_ascend.attention.attention.AscendAttentionBackend")
-
     def test_get_punica_wrapper(self):
         result = self.platform.get_punica_wrapper()
         self.assertEqual(
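
Since `use_v1=False` is now rejected by the platform (see the `platform.py` hunk below), a natural V1-era replacement for the deleted cases is an error-path test; a minimal sketch in the same style (hypothetical test name, not part of this commit):

```python
    def test_get_attn_backend_cls_v0_raises(self):
        # V0 backends were removed; requesting one must fail fast.
        with self.assertRaises(ValueError):
            self.platform.get_attn_backend_cls(
                selected_backend="ascend",
                head_size=64,
                dtype="float16",
                kv_cache_dtype="float16",
                block_size=64,
                use_v1=False,
                use_mla=False,
            )
```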

View File

@@ -1,355 +0,0 @@
import unittest
from unittest.mock import MagicMock, patch

import torch
from vllm.distributed.parallel_state import GroupCoordinator
from vllm.engine.arg_utils import EngineArgs
from vllm.pooling_params import PoolingParams
from vllm.sequence import SequenceData, SequenceGroupMetadata

from vllm_ascend.worker.pooling_model_runner import (
    ModelInputForNPUWithPoolingMetadata, NPUPoolingModelRunner)


class TestPoolingModelRunner(unittest.TestCase):
    """Unit tests for the NPUPoolingModelRunner class."""

    def _create_model_runner(self, model: str, *args,
                             **kwargs) -> NPUPoolingModelRunner:
        engine_args = EngineArgs(model, *args, **kwargs)
        engine_config = engine_args.create_engine_config()
        model_runner = NPUPoolingModelRunner(vllm_config=engine_config, )
        return model_runner

    def setUp(self):
        """Initialize test fixtures and common mocks"""
        self.attn_backend = "npu"
        model_runner = self._create_model_runner(
            "tests/ut/fake_weight",
            trust_remote_code=True,
            enable_chunked_prefill=False,
        )
        self.runner = model_runner
        self.runner.attn_backend = self.attn_backend
        model_runner.model = MagicMock()
        self.runner = model_runner
        # Sample test data
        self.sample_tensor_dict = {"tensor1": torch.randn(3, 4)}
        self.sample_seq_group = [MagicMock(spec=SequenceGroupMetadata)]
        self.sample_finished_ids = ["req1", "req2"]

    @patch(
        'vllm_ascend.worker.pooling_model_runner.ModelInputForNPUWithPoolingMetadata.from_broadcasted_tensor_dict'
    )
    def test_make_model_input_from_broadcasted_tensor_dict(
            self, mock_from_dict):
        """Test tensor dictionary conversion to model input"""
        # Setup mock return
        expected_output = MagicMock()
        mock_from_dict.return_value = expected_output
        # Execute
        result = self.runner.make_model_input_from_broadcasted_tensor_dict(
            self.sample_tensor_dict)
        # Verify
        mock_from_dict.assert_called_once_with(self.sample_tensor_dict,
                                               attn_backend=self.attn_backend)
        self.assertEqual(result, expected_output)

    @patch.object(NPUPoolingModelRunner, '_prepare_pooling')
    @patch.object(NPUPoolingModelRunner, '_prepare_model_input_tensors')
    def test_prepare_model_input_normal_case(self, mock_prepare_tensors,
                                             mock_prepare_pooling):
        """Test normal flow of model input preparation"""
        # Setup mocks
        mock_model_input = ModelInputForNPUWithPoolingMetadata(
            seq_lens=[1, 2, 3])
        mock_prepare_tensors.return_value = mock_model_input
        mock_pooling_metadata = MagicMock()
        mock_prepare_pooling.return_value = mock_pooling_metadata
        # Execute
        result = self.runner.prepare_model_input(
            seq_group_metadata_list=self.sample_seq_group,
            finished_requests_ids=self.sample_finished_ids)
        # Verify
        mock_prepare_tensors.assert_called_once_with(self.sample_seq_group,
                                                     self.sample_finished_ids)
        mock_prepare_pooling.assert_called_once_with(self.sample_seq_group,
                                                     mock_model_input.seq_lens)
        self.assertEqual(result.pooling_metadata, mock_pooling_metadata)

    def test_prepare_model_input_null_sequence_group(self):
        """Test assertion when seq_group_metadata_list is None"""
        with self.assertRaises(AssertionError):
            self.runner.prepare_model_input(
                seq_group_metadata_list=None,
                finished_requests_ids=self.sample_finished_ids)

    @patch.object(NPUPoolingModelRunner, '_prepare_model_input_tensors')
    def test_prepare_model_input_null_seq_lens(self, mock_prepare_tensors):
        """Test assertion when seq_lens is None in model input"""
        # Setup mock with None seq_lens
        mock_model_input = MagicMock()
        mock_model_input.seq_lens = None
        mock_prepare_tensors.return_value = mock_model_input
        with self.assertRaises(AssertionError):
            self.runner.prepare_model_input(
                seq_group_metadata_list=self.sample_seq_group,
                finished_requests_ids=self.sample_finished_ids)

    @patch.object(NPUPoolingModelRunner, '_prepare_pooling')
    @patch.object(NPUPoolingModelRunner, '_prepare_model_input_tensors')
    def test_prepare_model_input_with_virtual_engine(self,
                                                     mock_prepare_tensors,
                                                     mock_prepare_pooling):
        """Test virtual engine parameter is properly handled"""
        # Setup mocks
        mock_model_input = ModelInputForNPUWithPoolingMetadata(
            seq_lens=[1, 2, 3])
        mock_prepare_tensors.return_value = mock_model_input
        # Execute with virtual_engine parameter
        result = self.runner.prepare_model_input(
            seq_group_metadata_list=self.sample_seq_group,
            virtual_engine=1,
            finished_requests_ids=self.sample_finished_ids)
        # Verify virtual_engine doesn't affect the flow
        self.assertIsNotNone(result)

    @patch.object(NPUPoolingModelRunner, '_prepare_pooling')
    @patch.object(NPUPoolingModelRunner, '_prepare_model_input_tensors')
    def test_prepare_model_input_with_null_finished_ids(
            self, mock_prepare_tensors, mock_prepare_pooling):
        """Test case when finished_requests_ids is None"""
        # Setup mocks
        mock_model_input = ModelInputForNPUWithPoolingMetadata(
            seq_lens=[1, 2, 3])
        mock_prepare_tensors.return_value = mock_model_input
        # Execute with None finished_ids
        result = self.runner.prepare_model_input(
            seq_group_metadata_list=self.sample_seq_group,
            finished_requests_ids=None)
        # Verify
        mock_prepare_tensors.assert_called_once_with(self.sample_seq_group,
                                                     None)
        self.assertIsNotNone(result)

    @patch('vllm.model_executor.pooling_metadata.PoolingMetadata.__init__')
    def test_prepare_pooling_normal_case(self, mock_pooling_metadata):
        """Test normal case with multiple sequences in group"""
        # Setup test data
        mock_pooling_metadata.return_value = None
        seq_data = {
            1: MagicMock(spec=SequenceData),
            2: MagicMock(spec=SequenceData)
        }
        pooling_params = MagicMock(spec=PoolingParams)
        seq_group = MagicMock(spec=SequenceGroupMetadata)
        seq_group.seq_data = seq_data
        seq_group.pooling_params = pooling_params
        # Call the function
        self.runner._prepare_pooling([seq_group], [10, 20])
        # Verify results
        mock_pooling_metadata.assert_called_once_with(seq_groups=[
            ([1, 2], pooling_params)
        ],
                                                      seq_data=seq_data,
                                                      prompt_lens=[10, 20])

    @patch('vllm.model_executor.pooling_metadata.PoolingMetadata.__init__')
    def test_prepare_pooling_empty_group(self, mock_pooling_metadata):
        """Test case with empty sequence group"""
        # Setup empty group
        mock_pooling_metadata.return_value = None
        empty_seq_data: dict[int, SequenceData] = {}
        pooling_params = MagicMock(spec=PoolingParams)
        empty_group = MagicMock(spec=SequenceGroupMetadata)
        empty_group.seq_data = empty_seq_data
        empty_group.pooling_params = pooling_params
        # Call the function
        self.runner._prepare_pooling([empty_group], [])
        # Verify results
        mock_pooling_metadata.assert_called_once_with(seq_groups=[
            ([], pooling_params)
        ],
                                                      seq_data={},
                                                      prompt_lens=[])

    @patch('vllm.model_executor.pooling_metadata.PoolingMetadata.__init__')
    def test_prepare_pooling_single_sequence(self, mock_pooling_metadata):
        """Test case with single sequence in group"""
        # Setup single sequence
        mock_pooling_metadata.return_value = None
        single_seq_data = {3: MagicMock(spec=SequenceData)}
        pooling_params = MagicMock(spec=PoolingParams)
        single_group = MagicMock(spec=SequenceGroupMetadata)
        single_group.seq_data = single_seq_data
        single_group.pooling_params = pooling_params
        # Call the function
        self.runner._prepare_pooling([single_group], [5])
        # Verify results
        mock_pooling_metadata.assert_called_once_with(seq_groups=[
            ([3], pooling_params)
        ],
                                                      seq_data=single_seq_data,
                                                      prompt_lens=[5])

    @patch('vllm.model_executor.pooling_metadata.PoolingMetadata.__init__')
    def test_prepare_pooling_multiple_groups(self, mock_pooling_metadata):
        """Test case with multiple sequence groups"""
        # Setup multiple groups
        mock_pooling_metadata.return_value = None
        seq_data1 = {1: MagicMock(spec=SequenceData)}
        seq_data2 = {2: MagicMock(spec=SequenceData)}
        params1 = MagicMock(spec=PoolingParams)
        params2 = MagicMock(spec=PoolingParams)
        group1 = MagicMock(spec=SequenceGroupMetadata)
        group1.seq_data = seq_data1
        group1.pooling_params = params1
        group2 = MagicMock(spec=SequenceGroupMetadata)
        group2.seq_data = seq_data2
        group2.pooling_params = params2
        # Call the function
        self.runner._prepare_pooling([group1, group2], [10, 20])
        # Verify results
        mock_pooling_metadata.assert_called_once_with(seq_groups=[
            ([1], params1), ([2], params2)
        ],
                                                      seq_data={
                                                          **seq_data1,
                                                          **seq_data2
                                                      },
                                                      prompt_lens=[10, 20])

    @patch('vllm.model_executor.pooling_metadata.PoolingMetadata.__init__')
    def test_prepare_pooling_empty_input(self, mock_pooling_metadata):
        """Test case with empty input lists"""
        # Call the function with empty inputs
        mock_pooling_metadata.return_value = None
        self.runner._prepare_pooling([], [])
        # Verify results
        mock_pooling_metadata.assert_called_once_with(seq_groups=[],
                                                      seq_data={},
                                                      prompt_lens=[])

    @patch('vllm.forward_context.set_forward_context')
    @patch('vllm.distributed.parallel_state._PP',
           new_callable=lambda: MagicMock(spec=GroupCoordinator,
                                          is_last_rank=True))
    @patch('torch.npu.Event')
    @patch.object(NPUPoolingModelRunner, 'set_active_loras')
    @patch.object(NPUPoolingModelRunner, 'set_active_prompt_adapters')
    def test_execute_model_normal_flow(self, mock_set_adapters, mock_set_loras,
                                       mock_event, mock_pp, mock_set_forward):
        """Test normal execution path with all dependencies mocked"""
        # Setup model input mock
        mock_input = MagicMock()
        mock_input.input_tokens = torch.tensor([1])
        mock_input.input_positions = torch.tensor([0])
        mock_input.multi_modal_kwargs = {}
        self.runner.is_driver_worker = True
        # Execute
        self.runner.execute_model(model_input=mock_input,
                                  kv_caches=[],
                                  num_steps=1)
        # Verify core calls
        self.runner.model.pooler.assert_called_once()

    @patch('vllm.forward_context.set_forward_context')
    def test_execute_model_invalid_steps(self, mock_set_forward):
        """Test ValueError when num_steps != 1"""
        with self.assertRaises(ValueError):
            self.runner.execute_model(model_input=MagicMock(),
                                      kv_caches=[],
                                      num_steps=2)
        mock_set_forward.assert_not_called()

    @patch('vllm.forward_context.set_forward_context')
    @patch('vllm.distributed.parallel_state._PP',
           new_callable=lambda: MagicMock(spec=GroupCoordinator,
                                          is_last_rank=False))
    @patch('torch.npu.Event')
    def test_execute_model_perf_monitoring(self, mock_event, mock_pp,
                                           mock_set_forward):
        """Test performance monitoring with timing mocks"""
        # Setup mocks
        mock_event.return_value.elapsed_time.return_value = 15.0
        self.runner.observability_config = MagicMock(
            collect_model_forward_time=True)
        # Execute
        self.runner.execute_model(model_input=MagicMock(
            input_tokens=torch.tensor([1]),
            input_positions=torch.tensor([0]),
            multi_modal_kwargs={}),
                                  kv_caches=[],
                                  num_steps=1)
        # Verify timing calls
        self.assertEqual(mock_event.call_count, 2)

    @patch('vllm.forward_context.set_forward_context')
    @patch.object(NPUPoolingModelRunner, 'set_active_loras')
    @patch('vllm.distributed.parallel_state._PP',
           new_callable=lambda: MagicMock(spec=GroupCoordinator,
                                          is_last_rank=False))
    def test_execute_model_lora_config(self, mock_pp, set_active_loras,
                                       mock_set_forward):
        """Test LoRA configuration handling"""
        # Setup
        self.runner.lora_config = True
        mock_input = MagicMock()
        mock_input.lora_requests = ["req1"]
        mock_input.lora_mapping = {"map": 1}
        # Execute
        self.runner.execute_model(model_input=mock_input,
                                  kv_caches=[],
                                  num_steps=1)
        # Verify LoRA call
        set_active_loras.assert_called_once_with(["req1"], {"map": 1})

    @patch('vllm.forward_context.set_forward_context')
    @patch('vllm.distributed.parallel_state._PP',
           new_callable=lambda: MagicMock(spec=GroupCoordinator,
                                          is_last_rank=False))
    def test_execute_model_not_last_rank(self, mock_pp, mock_set_forward):
        """Test behavior when not the last pipeline rank"""
        # Setup
        # Execute
        self.runner.execute_model(model_input=MagicMock(
            input_tokens=torch.tensor([1]),
            input_positions=torch.tensor([0]),
            multi_modal_kwargs={}),
                                  kv_caches=[],
                                  num_steps=1)
        # Verify pooler not called
        self.runner.model.pooler.assert_not_called()

View File

@@ -117,7 +117,7 @@ class NPUPlatform(Platform):
     @classmethod
     def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         if not envs.VLLM_USE_V1:
-            raise ValueError("vLLM Ascend does not support V0 engine")
+            raise ValueError("vLLM Ascend does not support V0 engine.")
         # initialize ascend config from vllm additional_config
         ascend_config = init_ascend_config(vllm_config)
@@ -208,16 +208,16 @@ class NPUPlatform(Platform):
     @classmethod
     def get_attn_backend_cls(cls, selected_backend, head_size, dtype,
                              kv_cache_dtype, block_size, use_v1, use_mla):
-        if use_v1 and use_mla:
-            return "vllm_ascend.attention.mla_v1.AscendMLABackend"
+        if not use_v1:
+            raise ValueError("vLLM Ascend does not support V0 engine.")
         use_torchair = get_ascend_config().torchair_graph_config.enabled
-        if use_v1 and use_torchair:
-            return "vllm_ascend.attention.attention_v1_torchair.AscendAttentionTorchairBackend"
-        if use_v1:
-            return "vllm_ascend.attention.attention_v1.AscendAttentionBackend"
-        if use_mla:
-            return "vllm_ascend.attention.attention.AscendMLAAttentionBackend"
-        return "vllm_ascend.attention.attention.AscendAttentionBackend"
+        if use_mla:
+            return "vllm_ascend.attention.mla_v1.AscendMLABackend"
+        elif use_torchair:
+            return "vllm_ascend.attention.attention_v1_torchair.AscendAttentionTorchairBackend"
+        else:
+            return "vllm_ascend.attention.attention_v1.AscendAttentionBackend"

     @classmethod
     def get_punica_wrapper(cls) -> str:
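
For context, the new dispatch reduces to: V0 raises, MLA takes precedence over torchair, and torchair takes precedence over the default V1 backend. A quick illustrative check patterned on the unit tests above (assumes `NPUPlatform` is importable from `vllm_ascend.platform`; not part of this commit):

```python
from unittest.mock import patch

from vllm_ascend.platform import NPUPlatform

with patch('vllm_ascend.platform.get_ascend_config') as mock_cfg:
    # Disable the torchair graph so only use_mla steers the result.
    mock_cfg.return_value.torchair_graph_config.enabled = False
    assert NPUPlatform.get_attn_backend_cls(
        "ascend", 64, "float16", "float16", 64, use_v1=True,
        use_mla=True) == "vllm_ascend.attention.mla_v1.AscendMLABackend"
    assert NPUPlatform.get_attn_backend_cls(
        "ascend", 64, "float16", "float16", 64, use_v1=True, use_mla=False
    ) == "vllm_ascend.attention.attention_v1.AscendAttentionBackend"
```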