[Misc] Remove VLLM_USE_V1 usage in code (#1764)

We plan to remove the V0 code path from this version onward. The first step is to delete
the VLLM_USE_V1 checks and the V0-only branches they guard.

Related: https://github.com/vllm-project/vllm-ascend/issues/1620

- vLLM version: v0.9.2
- vLLM main: 61e20828da

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Author: wangxiyuan
Committed: 2025-07-15 11:52:16 +08:00 (via GitHub)
Parent: 494b0f474f
Commit: 7bdada58eb
6 changed files with 100 additions and 217 deletions
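
Every file below gets the same mechanical change: the envs.VLLM_USE_V1 guard is deleted and only the V1 branch is kept, while the platform entry point now rejects V0 outright. A schematic before/after sketch, illustrative only and not taken verbatim from the diff:

import vllm.envs as envs


def dispatch_before(run_v1, run_v0):
    # Pattern removed by this commit: call sites carried a dual V0/V1 path.
    if envs.VLLM_USE_V1:
        return run_v1()
    return run_v0()


def dispatch_after(run_v1):
    # Pattern after this commit: the V1 path is the only path.
    return run_v1()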

@@ -15,7 +15,6 @@
 # limitations under the License.
 from typing import Optional
-import vllm.envs as envs
 from vllm.logger import logger
 TORCHAIR_MODEL_LIST = ["deepseek", "pangu"]
@@ -126,46 +125,36 @@ def get_ascend_config():
 def check_ascend_config(vllm_config, enforce_eager):
     ascend_config = get_ascend_config()
 
-    # for v0 engine
-    if not envs.VLLM_USE_V1:
-        if ascend_config.torchair_graph_config.enabled:
-            raise NotImplementedError(
-                "Torchair graph mode is only supported for V1 Engine.")
-        if ascend_config.ascend_scheduler_config.enabled:
-            raise NotImplementedError(
-                "Ascend scheduler is only supported for V1 Engine.")
-    # for v1 engine
-    else:
-        # for eager mode
-        if enforce_eager:
-            # torchair_graph cannot be enabled with eager mode.
-            if ascend_config.torchair_graph_config.enabled:
-                raise RuntimeError(
-                    "Can't enable graph mode and eager mode at the same time. Please set `enforce_eager=False` if you attempt to enable NPU graph mode."
-                )
-        # for graph mode
-        else:
-            # torchair_graph case
-            if ascend_config.torchair_graph_config.enabled:
-                # torchair_graph is supported for deepseek/pangu model only.
-                if vllm_config.model_config:
-                    model_type = vllm_config.model_config.hf_config.model_type
-                    if not _check_torchair_supported(model_type):
-                        raise NotImplementedError(
-                            "Torchair graph mode only works with following model types:"
-                            f"{TORCHAIR_MODEL_LIST}.")
-            # aclgraph case
-            else:
-                # aclgraph doesn't work with deepseek model and only qwen model is well tested.
-                if vllm_config.model_config:
-                    model_type = vllm_config.model_config.hf_config.model_type
-                    if "deepseek" in model_type:
-                        raise NotImplementedError(
-                            "ACL Graph does not support deepseek. Please "
-                            "try torchair graph mode to serve deepseek models on vllm-ascend."
-                            " Or set `enforce_eager=True` to use eager mode.")
-                    if "qwen" not in model_type:
-                        logger.warning(
-                            "ACL Graph is currently experimental. Please "
-                            "raise an issue on https://github.com/vllm-project/vllm-ascend/issues"
-                            " if you encourage any Error")
+    # for eager mode
+    if enforce_eager:
+        # torchair_graph cannot be enabled with eager mode.
+        if ascend_config.torchair_graph_config.enabled:
+            raise RuntimeError(
+                "Can't enable graph mode and eager mode at the same time. Please set `enforce_eager=False` if you attempt to enable NPU graph mode."
+            )
+    # for graph mode
+    else:
+        # torchair_graph case
+        if ascend_config.torchair_graph_config.enabled:
+            # torchair_graph is supported for deepseek/pangu model only.
+            if vllm_config.model_config:
+                model_type = vllm_config.model_config.hf_config.model_type
+                if not _check_torchair_supported(model_type):
+                    raise NotImplementedError(
+                        "Torchair graph mode only works with following model types:"
+                        f"{TORCHAIR_MODEL_LIST}.")
+        # aclgraph case
+        else:
+            # aclgraph doesn't work with deepseek model and only qwen model is well tested.
+            if vllm_config.model_config:
+                model_type = vllm_config.model_config.hf_config.model_type
+                if "deepseek" in model_type:
+                    raise NotImplementedError(
+                        "ACL Graph does not support deepseek. Please "
+                        "try torchair graph mode to serve deepseek models on vllm-ascend."
+                        " Or set `enforce_eager=True` to use eager mode.")
+                if "qwen" not in model_type:
+                    logger.warning(
+                        "ACL Graph is currently experimental. Please "
+                        "raise an issue on https://github.com/vllm-project/vllm-ascend/issues"
+                        " if you encourage any Error")

@@ -30,7 +30,6 @@ from typing import Any, Dict, Iterable, List, Optional, Union
 import torch
 import torch.distributed as dist
 import torch_npu  # noqa: F401
-import vllm.envs as envs
 from torch import nn
 from transformers import PretrainedConfig
 from vllm.attention import Attention, AttentionMetadata
@@ -397,20 +396,17 @@ class CustomDeepseekDBOMLAAttention(DeepseekV2MLAAttention):
             hidden_states_or_q_c = hidden_states
         if self.torchair_graph_enabled:
             forward_kwargs = {}
-            if envs.VLLM_USE_V1:
-                output_shape = hidden_states.shape
-                output = torch.empty(output_shape,
-                                     dtype=hidden_states_or_q_c.dtype,
-                                     device=hidden_states_or_q_c.device)
-                forward_kwargs['output'] = output
+            output_shape = hidden_states.shape
+            output = torch.empty(output_shape,
+                                 dtype=hidden_states_or_q_c.dtype,
+                                 device=hidden_states_or_q_c.device)
+            forward_kwargs['output'] = output
             output = self.mla_attn.impl.forward(self.mla_attn,
                                                 hidden_states_or_q_c,
                                                 hidden_states, None, kv_cache,
                                                 attn_metadata,
                                                 **forward_kwargs)
-            if envs.VLLM_USE_V1:
-                output = output.view(-1, output_shape[-1])
+            output = output.view(-1, output_shape[-1])
             return output
         else:
             kv_c, k_pe = self.kv_a_proj_with_mqa(hidden_states)[0].split(
@@ -885,7 +881,7 @@ class CustomDeepseekDBOModel(nn.Module):
     def can_run_ms(self):
         attn_metadata = get_forward_context().attn_metadata
         # support mla attention and V1 engine at present
-        if not self.use_mla or not envs.VLLM_USE_V1:
+        if not self.use_mla:
             return False
         # enable prefill overlap
         if attn_metadata is None or attn_metadata.num_prefills == 0:
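
Both MLA forward changes (here and the matching CustomDeepseekV2MLAAttention change below) reduce to the same V1-only flow. A standalone sketch of that flow, with the layer and tensors passed in as arguments since this is not the actual method body:

import torch


def mla_forward_v1(mla_attn, hidden_states, hidden_states_or_q_c, kv_cache,
                   attn_metadata):
    # The output buffer is now always preallocated and handed to the MLA impl;
    # previously this only happened when envs.VLLM_USE_V1 was set.
    output = torch.empty(hidden_states.shape,
                         dtype=hidden_states_or_q_c.dtype,
                         device=hidden_states_or_q_c.device)
    output = mla_attn.impl.forward(mla_attn, hidden_states_or_q_c,
                                   hidden_states, None, kv_cache,
                                   attn_metadata, output=output)
    # Flatten back to (num_tokens, hidden_size), as the diff does.
    return output.view(-1, hidden_states.shape[-1])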

@@ -29,7 +29,6 @@ from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
 import torch
 import torch_npu
-import vllm.envs as envs
 from torch import nn
 from transformers import PretrainedConfig
 from vllm.attention import Attention, AttentionMetadata
@@ -579,20 +578,17 @@ class CustomDeepseekV2MLAAttention(DeepseekV2MLAAttention):
         else:
             hidden_states_or_q_c = hidden_states
         if self.torchair_graph_enabled:
-            if envs.VLLM_USE_V1:
-                output_shape = hidden_states.shape
-                output = torch.empty(output_shape,
-                                     dtype=hidden_states_or_q_c.dtype,
-                                     device=hidden_states_or_q_c.device)
-                forward_kwargs['output'] = output
+            output_shape = hidden_states.shape
+            output = torch.empty(output_shape,
+                                 dtype=hidden_states_or_q_c.dtype,
+                                 device=hidden_states_or_q_c.device)
+            forward_kwargs['output'] = output
             output = self.mla_attn.impl.forward(self.mla_attn,
                                                 hidden_states_or_q_c,
                                                 hidden_states, None, kv_cache,
                                                 attn_metadata,
                                                 **forward_kwargs)
-            if envs.VLLM_USE_V1:
-                output = output.view(-1, output_shape[-1])
+            output = output.view(-1, output_shape[-1])
             return output
         else:
             kv_c, k_pe = self.kv_a_proj_with_mqa(hidden_states)[0].split(
@@ -660,7 +656,7 @@ class CustomDeepseekV2DecoderLayer(DeepseekV2DecoderLayer):
                 prefix=f"{prefix}.mlp",
             )
             self.mla_moe_communication = ascend_config.torchair_graph_config.enable_multistream_moe \
-                and model_config.use_mla and envs.VLLM_USE_V1 and self.tp_size > 1
+                and model_config.use_mla and self.tp_size > 1
         else:
             self.mlp = CustomDeepseekV2MLP(
                 hidden_size=config.hidden_size,

@@ -16,7 +16,6 @@
 #
 import gc
-import os
 from datetime import timedelta
 from typing import TYPE_CHECKING, Optional, Tuple
@@ -117,6 +116,8 @@ class NPUPlatform(Platform):
     @classmethod
     def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
+        if not envs.VLLM_USE_V1:
+            raise ValueError("vLLM Ascend does not support V0 engine")
         # initialize ascend config from vllm additional_config
         ascend_config = init_ascend_config(vllm_config)
@@ -180,18 +181,7 @@ class NPUPlatform(Platform):
         update_aclgraph_sizes(vllm_config)
 
         if parallel_config and parallel_config.worker_cls == "auto":
-            if envs.VLLM_USE_V1:
-                parallel_config.worker_cls = "vllm_ascend.worker.worker_v1.NPUWorker"
-            elif vllm_config.speculative_config:
-                # NOTE: We set this var to `1` in vllm-ascend to avoid segment
-                # fault when using spec decode with V0 engine.
-                os.environ["ACL_OP_INIT_MODE"] = "1"
-                parallel_config.worker_cls = "vllm.spec_decode.spec_decode_worker.create_spec_worker"
-                parallel_config.sd_worker_cls = "vllm_ascend.worker.worker.NPUWorker"
-            elif vllm_config.scheduler_config.is_multi_step:
-                parallel_config.worker_cls = "vllm_ascend.worker.multi_step_worker.MultiStepWorker"
-            else:
-                parallel_config.worker_cls = "vllm_ascend.worker.worker.NPUWorker"
+            parallel_config.worker_cls = "vllm_ascend.worker.worker_v1.NPUWorker"
 
         if cache_config:
             if cache_config.block_size is None:
@@ -202,20 +192,18 @@ class NPUPlatform(Platform):
             )
             cache_config.block_size = 128
 
-        if envs.VLLM_USE_V1:
-            # Activate custom ops for v1, except on 310P
-            if not is_310p():
-                compilation_config.custom_ops = ["all"]
-
-            # If ascend_scheduler_config is enabled,
-            # extents original scheduler_config to use AscendScheduler.
-            if ascend_config.ascend_scheduler_config.enabled:
-                from vllm_ascend.core.schedule_config import \
-                    AscendSchedulerConfig
-                ascend_scheduler_config = AscendSchedulerConfig.initialize_from_config(
-                    vllm_config.scheduler_config,
-                    ascend_config.ascend_scheduler_config)
-                vllm_config.scheduler_config = ascend_scheduler_config
+        # Activate custom ops for v1, except on 310P
+        if not is_310p():
+            compilation_config.custom_ops = ["all"]
+
+        # If ascend_scheduler_config is enabled,
+        # extents original scheduler_config to use AscendScheduler.
+        if ascend_config.ascend_scheduler_config.enabled:
+            from vllm_ascend.core.schedule_config import AscendSchedulerConfig
+            ascend_scheduler_config = AscendSchedulerConfig.initialize_from_config(
+                vllm_config.scheduler_config,
+                ascend_config.ascend_scheduler_config)
+            vllm_config.scheduler_config = ascend_scheduler_config
 
     @classmethod
     def get_attn_backend_cls(cls, selected_backend, head_size, dtype,
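
Taken together, the platform changes mean V0 is refused up front and only the V1 worker is ever selected. A condensed sketch of that behavior (the helper name is mine, not from the code):

import vllm.envs as envs


def select_ascend_worker(vllm_config):
    # Reconstructed from the diff above: V0 is rejected, and the V0-only
    # spec-decode and multi-step workers are no longer reachable.
    if not envs.VLLM_USE_V1:
        raise ValueError("vLLM Ascend does not support V0 engine")
    parallel_config = vllm_config.parallel_config
    if parallel_config and parallel_config.worker_cls == "auto":
        parallel_config.worker_cls = "vllm_ascend.worker.worker_v1.NPUWorker"
    return parallel_config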