[Misc] Remove VLLM_USE_V1 usage in code (#1764)

We plan to remove the V0 code path in this release. As a first step, this
change deletes the `VLLM_USE_V1` checks so that only the V1 path remains.
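
For illustration, a minimal sketch in plain Python (not taken from the diff;
`use_v1` is a stand-in for vllm's VLLM_USE_V1 switch) of the kind of gating
this commit removes:

    # Before: worker selection branched on the engine version.
    use_v1 = True  # stand-in for vllm.envs.VLLM_USE_V1
    if use_v1:
        worker_cls = "vllm_ascend.worker.worker_v1.NPUWorker"
    else:
        worker_cls = "vllm_ascend.worker.worker.NPUWorker"

    # After: only the V1 worker path remains.
    worker_cls = "vllm_ascend.worker.worker_v1.NPUWorker"
    print(worker_cls)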

Related: https://github.com/vllm-project/vllm-ascend/issues/1620

- vLLM version: v0.9.2
- vLLM main: 61e20828da

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Author: wangxiyuan
Date: 2025-07-15 11:52:16 +08:00 (committed by GitHub)
Parent: 494b0f474f
Commit: 7bdada58eb
6 changed files with 100 additions and 217 deletions

View File

@@ -193,71 +193,48 @@ class TestAscendConfig(TestBase):
     @_clean_up_ascend_config
     def test_check_ascend_config_wrong_case(self):
         test_vllm_config = VllmConfig()
-        # For V0 engine
-        with mock.patch.dict(os.environ, {"VLLM_USE_V1": "0"}):
-            with self.assertRaises(NotImplementedError):
-                test_vllm_config.additional_config = {
-                    "torchair_graph_config": {
-                        "enabled": True,
-                    },
-                    "refresh": True
-                }
-                init_ascend_config(test_vllm_config)
-                check_ascend_config(test_vllm_config, False)
-            with self.assertRaises(NotImplementedError):
-                test_vllm_config.additional_config = {
-                    "ascend_scheduler_config": {
-                        "enabled": True,
-                    },
-                    "refresh": True
-                }
-                init_ascend_config(test_vllm_config)
-                check_ascend_config(test_vllm_config, True)
-        # For V1 engine
-        with mock.patch.dict(os.environ, {"VLLM_USE_V1": "1"}):
-            # torchair + eager mode
-            with self.assertRaises(RuntimeError):
-                test_vllm_config.additional_config = {
-                    "torchair_graph_config": {
-                        "enabled": True,
-                    },
-                    "refresh": True
-                }
-                init_ascend_config(test_vllm_config)
-                enforce_eager = True
-                check_ascend_config(test_vllm_config, enforce_eager)
-            # torchair + non deepseek model
-            with self.assertRaises(NotImplementedError):
-                test_vllm_config.additional_config = {
-                    "torchair_graph_config": {
-                        "enabled": True,
-                    },
-                    "refresh": True
-                }
-                model_path = os.path.join(os.path.dirname(__file__),
-                                          "fake_weight")
-                fake_model_config = ModelConfig(model=model_path)
-                fake_model_config.hf_config = PretrainedConfig()
-                fake_model_config.hf_config.model_type = "llama"
-                test_vllm_config.model_config = fake_model_config
-                init_ascend_config(test_vllm_config)
-                check_ascend_config(test_vllm_config, False)
-            # aclgraph + deepseek model
-            with self.assertRaises(NotImplementedError):
-                test_vllm_config.additional_config = {
-                    "torchair_graph_config": {
-                        "enabled": False,
-                    },
-                    "refresh": True
-                }
-                model_path = os.path.join(os.path.dirname(__file__),
-                                          "fake_weight")
-                fake_model_config = ModelConfig(model=model_path)
-                fake_model_config.hf_config = PretrainedConfig()
-                fake_model_config.hf_config.model_type = "deepseek"
-                test_vllm_config.model_config = fake_model_config
-                init_ascend_config(test_vllm_config)
-                check_ascend_config(test_vllm_config, False)
+        # torchair + eager mode
+        with self.assertRaises(RuntimeError):
+            test_vllm_config.additional_config = {
+                "torchair_graph_config": {
+                    "enabled": True,
+                },
+                "refresh": True
+            }
+            init_ascend_config(test_vllm_config)
+            enforce_eager = True
+            check_ascend_config(test_vllm_config, enforce_eager)
+        # torchair + non deepseek model
+        with self.assertRaises(NotImplementedError):
+            test_vllm_config.additional_config = {
+                "torchair_graph_config": {
+                    "enabled": True,
+                },
+                "refresh": True
+            }
+            model_path = os.path.join(os.path.dirname(__file__), "fake_weight")
+            fake_model_config = ModelConfig(model=model_path)
+            fake_model_config.hf_config = PretrainedConfig()
+            fake_model_config.hf_config.model_type = "llama"
+            test_vllm_config.model_config = fake_model_config
+            init_ascend_config(test_vllm_config)
+            check_ascend_config(test_vllm_config, False)
+        # aclgraph + deepseek model
+        with self.assertRaises(NotImplementedError):
+            test_vllm_config.additional_config = {
+                "torchair_graph_config": {
+                    "enabled": False,
+                },
+                "refresh": True
+            }
+            model_path = os.path.join(os.path.dirname(__file__), "fake_weight")
+            fake_model_config = ModelConfig(model=model_path)
+            fake_model_config.hf_config = PretrainedConfig()
+            fake_model_config.hf_config.model_type = "deepseek"
+            test_vllm_config.model_config = fake_model_config
+            init_ascend_config(test_vllm_config)
+            check_ascend_config(test_vllm_config, False)

     def test_check_torchair_supported(self):
         test_cases = [('deepseek_v3', True), ('PanguProMoE', True),

View File

@@ -389,69 +389,6 @@ class TestNPUPlatform(TestBase):
             "vllm_ascend.worker.worker_v1.NPUWorker",
         )

-    @patch("vllm_ascend.ascend_config.check_ascend_config")
-    @patch("vllm_ascend.ascend_config.init_ascend_config")
-    @patch("vllm.envs.VLLM_USE_V1", False)
-    def test_check_and_update_config_speculative_worker_config(
-            self, mock_init_ascend, mock_check_ascend):
-        mock_init_ascend.return_value = self.mock_ascend_config
-        self.mock_vllm_config.speculative_config = MagicMock()
-        self.mock_vllm_config.speculative_config.disable_logprobs = True
-        self.mock_vllm_config.parallel_config.worker_cls = "auto"
-        with patch.dict("os.environ", {}):
-            from vllm_ascend import platform
-            importlib.reload(platform)
-            self.platform.check_and_update_config(self.mock_vllm_config)
-            import os
-            self.assertEqual(os.environ.get("ACL_OP_INIT_MODE"), "1")
-        self.assertEqual(
-            self.mock_vllm_config.parallel_config.worker_cls,
-            "vllm.spec_decode.spec_decode_worker.create_spec_worker",
-        )
-        self.assertEqual(
-            self.mock_vllm_config.parallel_config.sd_worker_cls,
-            "vllm_ascend.worker.worker.NPUWorker",
-        )
-
-    @patch("vllm_ascend.ascend_config.check_ascend_config")
-    @patch("vllm_ascend.ascend_config.init_ascend_config")
-    @patch("vllm.envs.VLLM_USE_V1", False)
-    def test_check_and_update_config_multi_step_worker_config(
-            self, mock_init_ascend, mock_check_ascend):
-        mock_init_ascend.return_value = self.mock_ascend_config
-        self.mock_vllm_config.scheduler_config.is_multi_step = True
-        self.mock_vllm_config.parallel_config.worker_cls = "auto"
-        from vllm_ascend import platform
-        importlib.reload(platform)
-        self.platform.check_and_update_config(self.mock_vllm_config)
-        self.assertEqual(
-            self.mock_vllm_config.parallel_config.worker_cls,
-            "vllm_ascend.worker.multi_step_worker.MultiStepWorker",
-        )
-
-    @patch("vllm_ascend.ascend_config.check_ascend_config")
-    @patch("vllm_ascend.ascend_config.init_ascend_config")
-    @patch("vllm.envs.VLLM_USE_V1", False)
-    def test_check_and_update_config_default_worker_config(
-            self, mock_init_ascend, mock_check_ascend):
-        mock_init_ascend.return_value = self.mock_ascend_config
-        self.mock_vllm_config.parallel_config.worker_cls = "auto"
-        self.mock_vllm_config.scheduler_config.is_multi_step = False
-        from vllm_ascend import platform
-        importlib.reload(platform)
-        self.platform.check_and_update_config(self.mock_vllm_config)
-        self.assertEqual(
-            self.mock_vllm_config.parallel_config.worker_cls,
-            "vllm_ascend.worker.worker.NPUWorker",
-        )
-
     @patch("vllm_ascend.ascend_config.check_ascend_config")
     @patch("vllm_ascend.ascend_config.init_ascend_config")
     @patch("vllm_ascend.utils.is_310p", return_value=True)

View File

@@ -15,7 +15,6 @@
 # limitations under the License.
 from typing import Optional

-import vllm.envs as envs
 from vllm.logger import logger

 TORCHAIR_MODEL_LIST = ["deepseek", "pangu"]

@@ -126,46 +125,36 @@ def get_ascend_config():
 def check_ascend_config(vllm_config, enforce_eager):
     ascend_config = get_ascend_config()

-    # for v0 engine
-    if not envs.VLLM_USE_V1:
-        if ascend_config.torchair_graph_config.enabled:
-            raise NotImplementedError(
-                "Torchair graph mode is only supported for V1 Engine.")
-        if ascend_config.ascend_scheduler_config.enabled:
-            raise NotImplementedError(
-                "Ascend scheduler is only supported for V1 Engine.")
-    # for v1 engine
-    else:
-        # for eager mode
-        if enforce_eager:
-            # torchair_graph cannot be enabled with eager mode.
-            if ascend_config.torchair_graph_config.enabled:
-                raise RuntimeError(
-                    "Can't enable graph mode and eager mode at the same time. Please set `enforce_eager=False` if you attempt to enable NPU graph mode."
-                )
-        # for graph mode
-        else:
-            # torchair_graph case
-            if ascend_config.torchair_graph_config.enabled:
-                # torchair_graph is supported for deepseek/pangu model only.
-                if vllm_config.model_config:
-                    model_type = vllm_config.model_config.hf_config.model_type
-                    if not _check_torchair_supported(model_type):
-                        raise NotImplementedError(
-                            "Torchair graph mode only works with following model types:"
-                            f"{TORCHAIR_MODEL_LIST}.")
-            # aclgraph case
-            else:
-                # aclgraph doesn't work with deepseek model and only qwen model is well tested.
-                if vllm_config.model_config:
-                    model_type = vllm_config.model_config.hf_config.model_type
-                    if "deepseek" in model_type:
-                        raise NotImplementedError(
-                            "ACL Graph does not support deepseek. Please "
-                            "try torchair graph mode to serve deepseek models on vllm-ascend."
-                            " Or set `enforce_eager=True` to use eager mode.")
-                    if "qwen" not in model_type:
-                        logger.warning(
-                            "ACL Graph is currently experimental. Please "
-                            "raise an issue on https://github.com/vllm-project/vllm-ascend/issues"
-                            " if you encourage any Error")
+    # for eager mode
+    if enforce_eager:
+        # torchair_graph cannot be enabled with eager mode.
+        if ascend_config.torchair_graph_config.enabled:
+            raise RuntimeError(
+                "Can't enable graph mode and eager mode at the same time. Please set `enforce_eager=False` if you attempt to enable NPU graph mode."
+            )
+    # for graph mode
+    else:
+        # torchair_graph case
+        if ascend_config.torchair_graph_config.enabled:
+            # torchair_graph is supported for deepseek/pangu model only.
+            if vllm_config.model_config:
+                model_type = vllm_config.model_config.hf_config.model_type
+                if not _check_torchair_supported(model_type):
+                    raise NotImplementedError(
+                        "Torchair graph mode only works with following model types:"
+                        f"{TORCHAIR_MODEL_LIST}.")
+        # aclgraph case
+        else:
+            # aclgraph doesn't work with deepseek model and only qwen model is well tested.
+            if vllm_config.model_config:
+                model_type = vllm_config.model_config.hf_config.model_type
+                if "deepseek" in model_type:
+                    raise NotImplementedError(
+                        "ACL Graph does not support deepseek. Please "
+                        "try torchair graph mode to serve deepseek models on vllm-ascend."
+                        " Or set `enforce_eager=True` to use eager mode.")
+                if "qwen" not in model_type:
+                    logger.warning(
+                        "ACL Graph is currently experimental. Please "
+                        "raise an issue on https://github.com/vllm-project/vllm-ascend/issues"
+                        " if you encourage any Error")

View File

@@ -30,7 +30,6 @@ from typing import Any, Dict, Iterable, List, Optional, Union
 import torch
 import torch.distributed as dist
 import torch_npu  # noqa: F401
-import vllm.envs as envs
 from torch import nn
 from transformers import PretrainedConfig
 from vllm.attention import Attention, AttentionMetadata

@@ -397,20 +396,17 @@ class CustomDeepseekDBOMLAAttention(DeepseekV2MLAAttention):
             hidden_states_or_q_c = hidden_states
         if self.torchair_graph_enabled:
             forward_kwargs = {}
-            if envs.VLLM_USE_V1:
-                output_shape = hidden_states.shape
-                output = torch.empty(output_shape,
-                                     dtype=hidden_states_or_q_c.dtype,
-                                     device=hidden_states_or_q_c.device)
-                forward_kwargs['output'] = output
-
+            output_shape = hidden_states.shape
+            output = torch.empty(output_shape,
+                                 dtype=hidden_states_or_q_c.dtype,
+                                 device=hidden_states_or_q_c.device)
+            forward_kwargs['output'] = output
             output = self.mla_attn.impl.forward(self.mla_attn,
                                                 hidden_states_or_q_c,
                                                 hidden_states, None, kv_cache,
                                                 attn_metadata,
                                                 **forward_kwargs)
-            if envs.VLLM_USE_V1:
-                output = output.view(-1, output_shape[-1])
+            output = output.view(-1, output_shape[-1])
             return output
         else:
             kv_c, k_pe = self.kv_a_proj_with_mqa(hidden_states)[0].split(

@@ -885,7 +881,7 @@ class CustomDeepseekDBOModel(nn.Module):
     def can_run_ms(self):
         attn_metadata = get_forward_context().attn_metadata
         # support mla attention and V1 engine at present
-        if not self.use_mla or not envs.VLLM_USE_V1:
+        if not self.use_mla:
             return False
         # enable prefill overlap
         if attn_metadata is None or attn_metadata.num_prefills == 0:
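
The V1-only path kept above always pre-allocates the attention output buffer,
passes it through forward_kwargs, and flattens it afterwards; the deepseek_v2
hunk below makes the same change. A toy sketch of that pattern (pure torch;
fake_mla_forward is a hypothetical stand-in for mla_attn.impl.forward):

    import torch

    hidden_states = torch.randn(4, 8)  # toy (num_tokens, hidden_size)
    output_shape = hidden_states.shape
    output = torch.empty(output_shape,
                         dtype=hidden_states.dtype,
                         device=hidden_states.device)

    def fake_mla_forward(q, *, output):
        # the real impl writes attention results into the pre-allocated buffer
        output.copy_(q)
        return output

    output = fake_mla_forward(hidden_states, output=output)
    output = output.view(-1, output_shape[-1])
    print(output.shape)  # torch.Size([4, 8])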

View File

@@ -29,7 +29,6 @@ from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union

 import torch
 import torch_npu
-import vllm.envs as envs
 from torch import nn
 from transformers import PretrainedConfig
 from vllm.attention import Attention, AttentionMetadata

@@ -579,20 +578,17 @@ class CustomDeepseekV2MLAAttention(DeepseekV2MLAAttention):
         else:
             hidden_states_or_q_c = hidden_states
         if self.torchair_graph_enabled:
-            if envs.VLLM_USE_V1:
-                output_shape = hidden_states.shape
-                output = torch.empty(output_shape,
-                                     dtype=hidden_states_or_q_c.dtype,
-                                     device=hidden_states_or_q_c.device)
-                forward_kwargs['output'] = output
-
+            output_shape = hidden_states.shape
+            output = torch.empty(output_shape,
+                                 dtype=hidden_states_or_q_c.dtype,
+                                 device=hidden_states_or_q_c.device)
+            forward_kwargs['output'] = output
             output = self.mla_attn.impl.forward(self.mla_attn,
                                                 hidden_states_or_q_c,
                                                 hidden_states, None, kv_cache,
                                                 attn_metadata,
                                                 **forward_kwargs)
-            if envs.VLLM_USE_V1:
-                output = output.view(-1, output_shape[-1])
+            output = output.view(-1, output_shape[-1])
             return output
         else:
             kv_c, k_pe = self.kv_a_proj_with_mqa(hidden_states)[0].split(

@@ -660,7 +656,7 @@ class CustomDeepseekV2DecoderLayer(DeepseekV2DecoderLayer):
                 prefix=f"{prefix}.mlp",
             )
             self.mla_moe_communication = ascend_config.torchair_graph_config.enable_multistream_moe \
-                and model_config.use_mla and envs.VLLM_USE_V1 and self.tp_size > 1
+                and model_config.use_mla and self.tp_size > 1
         else:
             self.mlp = CustomDeepseekV2MLP(
                 hidden_size=config.hidden_size,

View File

@@ -16,7 +16,6 @@
 #
 import gc
-import os
 from datetime import timedelta
 from typing import TYPE_CHECKING, Optional, Tuple

@@ -117,6 +116,8 @@ class NPUPlatform(Platform):

     @classmethod
     def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
+        if not envs.VLLM_USE_V1:
+            raise ValueError("vLLM Ascend does not support V0 engine")
         # initialize ascend config from vllm additional_config
         ascend_config = init_ascend_config(vllm_config)

@@ -180,18 +181,7 @@ class NPUPlatform(Platform):
             update_aclgraph_sizes(vllm_config)

         if parallel_config and parallel_config.worker_cls == "auto":
-            if envs.VLLM_USE_V1:
-                parallel_config.worker_cls = "vllm_ascend.worker.worker_v1.NPUWorker"
-            elif vllm_config.speculative_config:
-                # NOTE: We set this var to `1` in vllm-ascend to avoid segment
-                # fault when using spec decode with V0 engine.
-                os.environ["ACL_OP_INIT_MODE"] = "1"
-                parallel_config.worker_cls = "vllm.spec_decode.spec_decode_worker.create_spec_worker"
-                parallel_config.sd_worker_cls = "vllm_ascend.worker.worker.NPUWorker"
-            elif vllm_config.scheduler_config.is_multi_step:
-                parallel_config.worker_cls = "vllm_ascend.worker.multi_step_worker.MultiStepWorker"
-            else:
-                parallel_config.worker_cls = "vllm_ascend.worker.worker.NPUWorker"
+            parallel_config.worker_cls = "vllm_ascend.worker.worker_v1.NPUWorker"

         if cache_config:
             if cache_config.block_size is None:

@@ -202,20 +192,18 @@ class NPUPlatform(Platform):
                 )
                 cache_config.block_size = 128

-        if envs.VLLM_USE_V1:
-            # Activate custom ops for v1, except on 310P
-            if not is_310p():
-                compilation_config.custom_ops = ["all"]
-
-            # If ascend_scheduler_config is enabled,
-            # extents original scheduler_config to use AscendScheduler.
-            if ascend_config.ascend_scheduler_config.enabled:
-                from vllm_ascend.core.schedule_config import \
-                    AscendSchedulerConfig
-                ascend_scheduler_config = AscendSchedulerConfig.initialize_from_config(
-                    vllm_config.scheduler_config,
-                    ascend_config.ascend_scheduler_config)
-                vllm_config.scheduler_config = ascend_scheduler_config
+        # Activate custom ops for v1, except on 310P
+        if not is_310p():
+            compilation_config.custom_ops = ["all"]
+
+        # If ascend_scheduler_config is enabled,
+        # extents original scheduler_config to use AscendScheduler.
+        if ascend_config.ascend_scheduler_config.enabled:
+            from vllm_ascend.core.schedule_config import AscendSchedulerConfig
+            ascend_scheduler_config = AscendSchedulerConfig.initialize_from_config(
+                vllm_config.scheduler_config,
+                ascend_config.ascend_scheduler_config)
+            vllm_config.scheduler_config = ascend_scheduler_config

     @classmethod
     def get_attn_backend_cls(cls, selected_backend, head_size, dtype,
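
For reference, a hedged sketch of how the updated platform hook behaves after
this hunk (the Fake* classes are stand-ins for vllm's config objects; only the
names quoted in the diff above are real): V0 is rejected outright and the
worker class is always the V1 NPUWorker.

    VLLM_USE_V1 = True  # stand-in for vllm.envs.VLLM_USE_V1

    class FakeParallelConfig:
        worker_cls = "auto"

    class FakeVllmConfig:
        parallel_config = FakeParallelConfig()

    def check_and_update_config_sketch(vllm_config):
        # V0 is no longer accepted on vllm-ascend
        if not VLLM_USE_V1:
            raise ValueError("vLLM Ascend does not support V0 engine")
        # the only remaining worker path is the V1 NPUWorker
        if vllm_config.parallel_config.worker_cls == "auto":
            vllm_config.parallel_config.worker_cls = (
                "vllm_ascend.worker.worker_v1.NPUWorker")

    cfg = FakeVllmConfig()
    check_and_update_config_sketch(cfg)
    print(cfg.parallel_config.worker_cls)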