[Misc] Remove VLLM_USE_V1 usage in code (#1764)

We plan to remove the V0 code from this version. The first step is to delete
V0 usage.

Related: https://github.com/vllm-project/vllm-ascend/issues/1620

- vLLM version: v0.9.2
- vLLM main:
61e20828da

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
wangxiyuan
2025-07-15 11:52:16 +08:00
committed by GitHub
parent 494b0f474f
commit 7bdada58eb
6 changed files with 100 additions and 217 deletions

View File

@@ -193,28 +193,7 @@ class TestAscendConfig(TestBase):
@_clean_up_ascend_config @_clean_up_ascend_config
def test_check_ascend_config_wrong_case(self): def test_check_ascend_config_wrong_case(self):
test_vllm_config = VllmConfig() test_vllm_config = VllmConfig()
# For V0 engine
with mock.patch.dict(os.environ, {"VLLM_USE_V1": "0"}):
with self.assertRaises(NotImplementedError):
test_vllm_config.additional_config = {
"torchair_graph_config": {
"enabled": True,
},
"refresh": True
}
init_ascend_config(test_vllm_config)
check_ascend_config(test_vllm_config, False)
with self.assertRaises(NotImplementedError):
test_vllm_config.additional_config = {
"ascend_scheduler_config": {
"enabled": True,
},
"refresh": True
}
init_ascend_config(test_vllm_config)
check_ascend_config(test_vllm_config, True)
# For V1 engine
with mock.patch.dict(os.environ, {"VLLM_USE_V1": "1"}):
# torchair + eager mode # torchair + eager mode
with self.assertRaises(RuntimeError): with self.assertRaises(RuntimeError):
test_vllm_config.additional_config = { test_vllm_config.additional_config = {
@@ -234,8 +213,7 @@ class TestAscendConfig(TestBase):
}, },
"refresh": True "refresh": True
} }
model_path = os.path.join(os.path.dirname(__file__), model_path = os.path.join(os.path.dirname(__file__), "fake_weight")
"fake_weight")
fake_model_config = ModelConfig(model=model_path) fake_model_config = ModelConfig(model=model_path)
fake_model_config.hf_config = PretrainedConfig() fake_model_config.hf_config = PretrainedConfig()
fake_model_config.hf_config.model_type = "llama" fake_model_config.hf_config.model_type = "llama"
@@ -250,8 +228,7 @@ class TestAscendConfig(TestBase):
}, },
"refresh": True "refresh": True
} }
model_path = os.path.join(os.path.dirname(__file__), model_path = os.path.join(os.path.dirname(__file__), "fake_weight")
"fake_weight")
fake_model_config = ModelConfig(model=model_path) fake_model_config = ModelConfig(model=model_path)
fake_model_config.hf_config = PretrainedConfig() fake_model_config.hf_config = PretrainedConfig()
fake_model_config.hf_config.model_type = "deepseek" fake_model_config.hf_config.model_type = "deepseek"

View File

@@ -389,69 +389,6 @@ class TestNPUPlatform(TestBase):
"vllm_ascend.worker.worker_v1.NPUWorker", "vllm_ascend.worker.worker_v1.NPUWorker",
) )
@patch("vllm_ascend.ascend_config.check_ascend_config")
@patch("vllm_ascend.ascend_config.init_ascend_config")
@patch("vllm.envs.VLLM_USE_V1", False)
def test_check_and_update_config_speculative_worker_config(
self, mock_init_ascend, mock_check_ascend):
mock_init_ascend.return_value = self.mock_ascend_config
self.mock_vllm_config.speculative_config = MagicMock()
self.mock_vllm_config.speculative_config.disable_logprobs = True
self.mock_vllm_config.parallel_config.worker_cls = "auto"
with patch.dict("os.environ", {}):
from vllm_ascend import platform
importlib.reload(platform)
self.platform.check_and_update_config(self.mock_vllm_config)
import os
self.assertEqual(os.environ.get("ACL_OP_INIT_MODE"), "1")
self.assertEqual(
self.mock_vllm_config.parallel_config.worker_cls,
"vllm.spec_decode.spec_decode_worker.create_spec_worker",
)
self.assertEqual(
self.mock_vllm_config.parallel_config.sd_worker_cls,
"vllm_ascend.worker.worker.NPUWorker",
)
@patch("vllm_ascend.ascend_config.check_ascend_config")
@patch("vllm_ascend.ascend_config.init_ascend_config")
@patch("vllm.envs.VLLM_USE_V1", False)
def test_check_and_update_config_multi_step_worker_config(
self, mock_init_ascend, mock_check_ascend):
mock_init_ascend.return_value = self.mock_ascend_config
self.mock_vllm_config.scheduler_config.is_multi_step = True
self.mock_vllm_config.parallel_config.worker_cls = "auto"
from vllm_ascend import platform
importlib.reload(platform)
self.platform.check_and_update_config(self.mock_vllm_config)
self.assertEqual(
self.mock_vllm_config.parallel_config.worker_cls,
"vllm_ascend.worker.multi_step_worker.MultiStepWorker",
)
@patch("vllm_ascend.ascend_config.check_ascend_config")
@patch("vllm_ascend.ascend_config.init_ascend_config")
@patch("vllm.envs.VLLM_USE_V1", False)
def test_check_and_update_config_default_worker_config(
self, mock_init_ascend, mock_check_ascend):
mock_init_ascend.return_value = self.mock_ascend_config
self.mock_vllm_config.parallel_config.worker_cls = "auto"
self.mock_vllm_config.scheduler_config.is_multi_step = False
from vllm_ascend import platform
importlib.reload(platform)
self.platform.check_and_update_config(self.mock_vllm_config)
self.assertEqual(
self.mock_vllm_config.parallel_config.worker_cls,
"vllm_ascend.worker.worker.NPUWorker",
)
@patch("vllm_ascend.ascend_config.check_ascend_config") @patch("vllm_ascend.ascend_config.check_ascend_config")
@patch("vllm_ascend.ascend_config.init_ascend_config") @patch("vllm_ascend.ascend_config.init_ascend_config")
@patch("vllm_ascend.utils.is_310p", return_value=True) @patch("vllm_ascend.utils.is_310p", return_value=True)

View File

@@ -15,7 +15,6 @@
# limitations under the License. # limitations under the License.
from typing import Optional from typing import Optional
import vllm.envs as envs
from vllm.logger import logger from vllm.logger import logger
TORCHAIR_MODEL_LIST = ["deepseek", "pangu"] TORCHAIR_MODEL_LIST = ["deepseek", "pangu"]
@@ -126,16 +125,6 @@ def get_ascend_config():
def check_ascend_config(vllm_config, enforce_eager): def check_ascend_config(vllm_config, enforce_eager):
ascend_config = get_ascend_config() ascend_config = get_ascend_config()
# for v0 engine
if not envs.VLLM_USE_V1:
if ascend_config.torchair_graph_config.enabled:
raise NotImplementedError(
"Torchair graph mode is only supported for V1 Engine.")
if ascend_config.ascend_scheduler_config.enabled:
raise NotImplementedError(
"Ascend scheduler is only supported for V1 Engine.")
# for v1 engine
else:
# for eager mode # for eager mode
if enforce_eager: if enforce_eager:
# torchair_graph cannot be enabled with eager mode. # torchair_graph cannot be enabled with eager mode.

View File

@@ -30,7 +30,6 @@ from typing import Any, Dict, Iterable, List, Optional, Union
import torch import torch
import torch.distributed as dist import torch.distributed as dist
import torch_npu # noqa: F401 import torch_npu # noqa: F401
import vllm.envs as envs
from torch import nn from torch import nn
from transformers import PretrainedConfig from transformers import PretrainedConfig
from vllm.attention import Attention, AttentionMetadata from vllm.attention import Attention, AttentionMetadata
@@ -397,19 +396,16 @@ class CustomDeepseekDBOMLAAttention(DeepseekV2MLAAttention):
hidden_states_or_q_c = hidden_states hidden_states_or_q_c = hidden_states
if self.torchair_graph_enabled: if self.torchair_graph_enabled:
forward_kwargs = {} forward_kwargs = {}
if envs.VLLM_USE_V1:
output_shape = hidden_states.shape output_shape = hidden_states.shape
output = torch.empty(output_shape, output = torch.empty(output_shape,
dtype=hidden_states_or_q_c.dtype, dtype=hidden_states_or_q_c.dtype,
device=hidden_states_or_q_c.device) device=hidden_states_or_q_c.device)
forward_kwargs['output'] = output forward_kwargs['output'] = output
output = self.mla_attn.impl.forward(self.mla_attn, output = self.mla_attn.impl.forward(self.mla_attn,
hidden_states_or_q_c, hidden_states_or_q_c,
hidden_states, None, kv_cache, hidden_states, None, kv_cache,
attn_metadata, attn_metadata,
**forward_kwargs) **forward_kwargs)
if envs.VLLM_USE_V1:
output = output.view(-1, output_shape[-1]) output = output.view(-1, output_shape[-1])
return output return output
else: else:
@@ -885,7 +881,7 @@ class CustomDeepseekDBOModel(nn.Module):
def can_run_ms(self): def can_run_ms(self):
attn_metadata = get_forward_context().attn_metadata attn_metadata = get_forward_context().attn_metadata
# support mla attention and V1 engine at present # support mla attention and V1 engine at present
if not self.use_mla or not envs.VLLM_USE_V1: if not self.use_mla:
return False return False
# enable prefill overlap # enable prefill overlap
if attn_metadata is None or attn_metadata.num_prefills == 0: if attn_metadata is None or attn_metadata.num_prefills == 0:

View File

@@ -29,7 +29,6 @@ from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
import torch import torch
import torch_npu import torch_npu
import vllm.envs as envs
from torch import nn from torch import nn
from transformers import PretrainedConfig from transformers import PretrainedConfig
from vllm.attention import Attention, AttentionMetadata from vllm.attention import Attention, AttentionMetadata
@@ -579,19 +578,16 @@ class CustomDeepseekV2MLAAttention(DeepseekV2MLAAttention):
else: else:
hidden_states_or_q_c = hidden_states hidden_states_or_q_c = hidden_states
if self.torchair_graph_enabled: if self.torchair_graph_enabled:
if envs.VLLM_USE_V1:
output_shape = hidden_states.shape output_shape = hidden_states.shape
output = torch.empty(output_shape, output = torch.empty(output_shape,
dtype=hidden_states_or_q_c.dtype, dtype=hidden_states_or_q_c.dtype,
device=hidden_states_or_q_c.device) device=hidden_states_or_q_c.device)
forward_kwargs['output'] = output forward_kwargs['output'] = output
output = self.mla_attn.impl.forward(self.mla_attn, output = self.mla_attn.impl.forward(self.mla_attn,
hidden_states_or_q_c, hidden_states_or_q_c,
hidden_states, None, kv_cache, hidden_states, None, kv_cache,
attn_metadata, attn_metadata,
**forward_kwargs) **forward_kwargs)
if envs.VLLM_USE_V1:
output = output.view(-1, output_shape[-1]) output = output.view(-1, output_shape[-1])
return output return output
else: else:
@@ -660,7 +656,7 @@ class CustomDeepseekV2DecoderLayer(DeepseekV2DecoderLayer):
prefix=f"{prefix}.mlp", prefix=f"{prefix}.mlp",
) )
self.mla_moe_communication = ascend_config.torchair_graph_config.enable_multistream_moe \ self.mla_moe_communication = ascend_config.torchair_graph_config.enable_multistream_moe \
and model_config.use_mla and envs.VLLM_USE_V1 and self.tp_size > 1 and model_config.use_mla and self.tp_size > 1
else: else:
self.mlp = CustomDeepseekV2MLP( self.mlp = CustomDeepseekV2MLP(
hidden_size=config.hidden_size, hidden_size=config.hidden_size,

View File

@@ -16,7 +16,6 @@
# #
import gc import gc
import os
from datetime import timedelta from datetime import timedelta
from typing import TYPE_CHECKING, Optional, Tuple from typing import TYPE_CHECKING, Optional, Tuple
@@ -117,6 +116,8 @@ class NPUPlatform(Platform):
@classmethod @classmethod
def check_and_update_config(cls, vllm_config: VllmConfig) -> None: def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
if not envs.VLLM_USE_V1:
raise ValueError("vLLM Ascend does not support V0 engine")
# initialize ascend config from vllm additional_config # initialize ascend config from vllm additional_config
ascend_config = init_ascend_config(vllm_config) ascend_config = init_ascend_config(vllm_config)
@@ -180,18 +181,7 @@ class NPUPlatform(Platform):
update_aclgraph_sizes(vllm_config) update_aclgraph_sizes(vllm_config)
if parallel_config and parallel_config.worker_cls == "auto": if parallel_config and parallel_config.worker_cls == "auto":
if envs.VLLM_USE_V1:
parallel_config.worker_cls = "vllm_ascend.worker.worker_v1.NPUWorker" parallel_config.worker_cls = "vllm_ascend.worker.worker_v1.NPUWorker"
elif vllm_config.speculative_config:
# NOTE: We set this var to `1` in vllm-ascend to avoid segment
# fault when using spec decode with V0 engine.
os.environ["ACL_OP_INIT_MODE"] = "1"
parallel_config.worker_cls = "vllm.spec_decode.spec_decode_worker.create_spec_worker"
parallel_config.sd_worker_cls = "vllm_ascend.worker.worker.NPUWorker"
elif vllm_config.scheduler_config.is_multi_step:
parallel_config.worker_cls = "vllm_ascend.worker.multi_step_worker.MultiStepWorker"
else:
parallel_config.worker_cls = "vllm_ascend.worker.worker.NPUWorker"
if cache_config: if cache_config:
if cache_config.block_size is None: if cache_config.block_size is None:
@@ -202,7 +192,6 @@ class NPUPlatform(Platform):
) )
cache_config.block_size = 128 cache_config.block_size = 128
if envs.VLLM_USE_V1:
# Activate custom ops for v1, except on 310P # Activate custom ops for v1, except on 310P
if not is_310p(): if not is_310p():
compilation_config.custom_ops = ["all"] compilation_config.custom_ops = ["all"]
@@ -210,8 +199,7 @@ class NPUPlatform(Platform):
# If ascend_scheduler_config is enabled, # If ascend_scheduler_config is enabled,
# extents original scheduler_config to use AscendScheduler. # extents original scheduler_config to use AscendScheduler.
if ascend_config.ascend_scheduler_config.enabled: if ascend_config.ascend_scheduler_config.enabled:
from vllm_ascend.core.schedule_config import \ from vllm_ascend.core.schedule_config import AscendSchedulerConfig
AscendSchedulerConfig
ascend_scheduler_config = AscendSchedulerConfig.initialize_from_config( ascend_scheduler_config = AscendSchedulerConfig.initialize_from_config(
vllm_config.scheduler_config, vllm_config.scheduler_config,
ascend_config.ascend_scheduler_config) ascend_config.ascend_scheduler_config)