Remove VLLM_USE_V1 (#4086)
Drop all VLLM_USE_V1 usage: this environment variable has already been removed from vLLM itself, so setting it no longer has any effect.
- vLLM version: v0.11.0
- vLLM main: 83f478bb19
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
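Since V1 is now the only engine, downstream scripts migrate by simply deleting the variable; nothing else changes. A minimal sketch (the model name is taken from the test diffs below; host and port are illustrative, not from this commit):

    # before this change
    export VLLM_USE_V1=1
    vllm serve Qwen/Qwen3-0.6B --host 0.0.0.0 --port 8000

    # after: drop the export; the serve command is unchanged
    vllm serve Qwen/Qwen3-0.6B --host 0.0.0.0 --port 8000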
.github/workflows/nightly_benchmarks.yaml
@@ -46,14 +46,13 @@ jobs:
   test:
     if: ${{ contains(github.event.pull_request.labels.*.name, 'performance-test') && contains(github.event.pull_request.labels.*.name, 'ready-for-test') || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
-    name: Benchmarks/vLLM=${{ matrix.vllm_branch }}, vLLM-Ascend=${{ matrix.vllm_ascend_branch }}, use_v1=${{ matrix.vllm_use_v1 }}
+    name: Benchmarks/vLLM=${{ matrix.vllm_branch }}, vLLM-Ascend=${{ matrix.vllm_ascend_branch }}
     runs-on: 'linux-arm64-npu-static-8'
     strategy:
       matrix:
         include:
           - vllm_branch: v0.11.0
             vllm_ascend_branch: main
-            vllm_use_v1: 1
       max-parallel: 1
     container:
       image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-910b-ubuntu22.04-py3.11
@@ -73,7 +72,6 @@ jobs:
       VLLM_USE_MODELSCOPE: True
       ES_OM_DOMAIN: ${{ secrets.ES_OM_DOMAIN }}
       ES_OM_AUTHORIZATION: ${{ secrets.ES_OM_AUTHORIZATION }}
-      VLLM_USE_V1: ${{ matrix.vllm_use_v1 }}
     steps:
       - name: Check npu and CANN info
        run: |
@@ -200,7 +198,6 @@ jobs:
             --created_at "$commit_time_no_tz" \
             --res_dir ./benchmarks/results \
             --error "$ERROR_MSG" \
-            --extra_feat '{"VLLM_USE_V1": "${{ matrix.vllm_use_v1 }}"}'
           rm -rf ./benchmarks/results
           cd -
           done < commit_log.txt
@@ -160,7 +160,6 @@ export TP_SOCKET_IFNAME=$nic_name
 export HCCL_SOCKET_IFNAME=$nic_name
 export OMP_PROC_BIND=false
 export OMP_NUM_THREADS=100
-export VLLM_USE_V1=1
 export HCCL_BUFFSIZE=1024
 
 vllm serve vllm-ascend/DeepSeek-V3.1-W8A8 \
@@ -70,7 +70,6 @@ export TP_SOCKET_IFNAME=$nic_name
 export HCCL_SOCKET_IFNAME=$nic_name
 export OMP_PROC_BIND=false
 export OMP_NUM_THREADS=100
-export VLLM_USE_V1=1
 export HCCL_BUFFSIZE=1024
 
 # The w8a8 weight can be obtained from https://www.modelscope.cn/models/vllm-ascend/Kimi-K2-Instruct-W8A8
@@ -116,7 +115,6 @@ export TP_SOCKET_IFNAME=$nic_name
 export HCCL_SOCKET_IFNAME=$nic_name
 export OMP_PROC_BIND=false
 export OMP_NUM_THREADS=100
-export VLLM_USE_V1=1
 export HCCL_BUFFSIZE=1024
 
 vllm serve /home/cache/weights/Kimi-K2-Instruct-W8A8 \
@@ -104,7 +104,6 @@ export HCCL_SOCKET_IFNAME="eth0"
 export DISAGGREGATED_PREFILL_RANK_TABLE_PATH="/path/to/your/generated/ranktable.json"
 export OMP_PROC_BIND=false
 export OMP_NUM_THREADS=10
-export VLLM_USE_V1=1
 
 vllm serve /model/Qwen3-30B-A3B \
     --host 0.0.0.0 \
@@ -144,7 +143,6 @@ export HCCL_SOCKET_IFNAME="eth0"
 export DISAGGREGATED_PREFILL_RANK_TABLE_PATH="/path/to/your/generated/ranktable.json"
 export OMP_PROC_BIND=false
 export OMP_NUM_THREADS=10
-export VLLM_USE_V1=1
 
 vllm serve /model/Qwen3-30B-A3B \
     --host 0.0.0.0 \
@@ -183,7 +181,6 @@ export HCCL_SOCKET_IFNAME="eth0"
 export DISAGGREGATED_PREFILL_RANK_TABLE_PATH="/path/to/your/generated/ranktable.json"
 export OMP_PROC_BIND=false
 export OMP_NUM_THREADS=10
-export VLLM_USE_V1=1
 
 vllm serve /model/Qwen3-30B-A3B \
     --host 0.0.0.0 \
@@ -107,7 +107,6 @@ export HCCL_IF_IP=192.0.0.1
 export GLOO_SOCKET_IFNAME="eth0" # network card name
 export TP_SOCKET_IFNAME="eth0"
 export HCCL_SOCKET_IFNAME="eth0"
-export VLLM_USE_V1=1
 export HCCL_BUFFSIZE=1024
 export OMP_PROC_BIND=false
 export OMP_NUM_THREADS=10
@@ -162,7 +161,6 @@ export HCCL_IF_IP=192.0.0.2
 export GLOO_SOCKET_IFNAME="eth0" # network card name
 export TP_SOCKET_IFNAME="eth0"
 export HCCL_SOCKET_IFNAME="eth0"
-export VLLM_USE_V1=1
 export HCCL_BUFFSIZE=1024
 export OMP_PROC_BIND=false
 export OMP_NUM_THREADS=10
@@ -217,7 +215,6 @@ export HCCL_IF_IP=192.0.0.3
 export GLOO_SOCKET_IFNAME="eth0" # network card name
 export TP_SOCKET_IFNAME="eth0"
 export HCCL_SOCKET_IFNAME="eth0"
-export VLLM_USE_V1=1
 export HCCL_BUFFSIZE=2048
 export OMP_PROC_BIND=false
 export OMP_NUM_THREADS=10
@@ -274,7 +271,6 @@ export HCCL_IF_IP=192.0.0.4
 export GLOO_SOCKET_IFNAME="eth0" # network card name
 export TP_SOCKET_IFNAME="eth0"
 export HCCL_SOCKET_IFNAME="eth0"
-export VLLM_USE_V1=1
 export HCCL_BUFFSIZE=2048
 export OMP_PROC_BIND=false
 export OMP_NUM_THREADS=10
@@ -338,7 +334,6 @@ export HCCL_IF_IP=192.0.0.1
 export GLOO_SOCKET_IFNAME="eth0" # network card name
 export TP_SOCKET_IFNAME="eth0"
 export HCCL_SOCKET_IFNAME="eth0"
-export VLLM_USE_V1=1
 export HCCL_BUFFSIZE=1024
 export OMP_PROC_BIND=false
 export OMP_NUM_THREADS=10
@@ -393,7 +388,6 @@ export HCCL_IF_IP=192.0.0.2
 export GLOO_SOCKET_IFNAME="eth0" # network card name
 export TP_SOCKET_IFNAME="eth0"
 export HCCL_SOCKET_IFNAME="eth0"
-export VLLM_USE_V1=1
 export HCCL_BUFFSIZE=1024
 export OMP_PROC_BIND=false
 export OMP_NUM_THREADS=10
@@ -448,7 +442,6 @@ export HCCL_IF_IP=192.0.0.3
 export GLOO_SOCKET_IFNAME="eth0" # network card name
 export TP_SOCKET_IFNAME="eth0"
 export HCCL_SOCKET_IFNAME="eth0"
-export VLLM_USE_V1=1
 export HCCL_BUFFSIZE=2048
 export OMP_PROC_BIND=false
 export OMP_NUM_THREADS=10
@@ -505,7 +498,6 @@ export HCCL_IF_IP=192.0.0.4
 export GLOO_SOCKET_IFNAME="eth0" # network card name
 export TP_SOCKET_IFNAME="eth0"
 export HCCL_SOCKET_IFNAME="eth0"
-export VLLM_USE_V1=1
 export HCCL_BUFFSIZE=2048
 export OMP_PROC_BIND=false
 export OMP_NUM_THREADS=10
@@ -70,7 +70,6 @@ export TP_SOCKET_IFNAME=$nic_name
 export HCCL_SOCKET_IFNAME=$nic_name
 export OMP_PROC_BIND=false
 export OMP_NUM_THREADS=100
-export VLLM_USE_V1=1
 export HCCL_BUFFSIZE=1024
 
 vllm serve Qwen/Qwen3-VL-235B-A22B-Instruct \
@@ -112,7 +111,6 @@ export TP_SOCKET_IFNAME=$nic_name
 export HCCL_SOCKET_IFNAME=$nic_name
 export OMP_PROC_BIND=false
 export OMP_NUM_THREADS=100
-export VLLM_USE_V1=1
 export HCCL_BUFFSIZE=1024
 
 vllm serve Qwen/Qwen3-VL-235B-A22B-Instruct \
@@ -82,7 +82,6 @@ The content of the multi_producer.sh script:
 export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
 export PYTHONPATH=$PYTHONPATH:/xxxxx/vllm
 export MOONCAKE_CONFIG_PATH="/xxxxxx/mooncake.json"
-export VLLM_USE_V1=1
 export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3
 export ACL_OP_INIT_MODE=1
 export ASCEND_BUFFER_POOL=4:8
@@ -145,7 +144,6 @@ The content of multi_consumer.sh:
 export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
 export PYTHONPATH=$PYTHONPATH:/xxxxx/vllm
 export MOONCAKE_CONFIG_PATH="/xxxxx/mooncake.json"
-export VLLM_USE_V1=1
 export ASCEND_RT_VISIBLE_DEVICES=4,5,6,7
 export ACL_OP_INIT_MODE=1
 export ASCEND_BUFFER_POOL=4:8
@@ -246,7 +244,6 @@ Content of mixed_department.sh:
 export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
 export PYTHONPATH=$PYTHONPATH:/xxxxx/vllm
 export MOONCAKE_CONFIG_PATH="/xxxxxx/mooncake.json"
-export VLLM_USE_V1=1
 export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3
 export ACL_OP_INIT_MODE=1
 export ASCEND_BUFFER_POOL=4:8
@@ -41,7 +41,6 @@ export HCCL_SOCKET_IFNAME="eth0"
 export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/examples/disaggregated_prefill_v1/ranktable.json
 export OMP_PROC_BIND=false
 export OMP_NUM_THREADS=100
-export VLLM_USE_V1=1
 export VLLM_ASCEND_LLMDD_RPC_PORT=5559
 
 vllm serve /models/deepseek_r1_w8a8 \
@@ -82,7 +81,6 @@ export HCCL_SOCKET_IFNAME="eth0"
 export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/examples/disaggregated_prefill_v1/ranktable.json
 export OMP_PROC_BIND=false
 export OMP_NUM_THREADS=100
-export VLLM_USE_V1=1
 export VLLM_ASCEND_LLMDD_RPC_PORT=5659
 
 vllm serve /models/deepseek_r1_w8a8 \
@@ -126,7 +124,6 @@ export HCCL_SOCKET_IFNAME="eth0"
 export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/examples/disaggregated_prefill_v1/ranktable.json
 export OMP_PROC_BIND=false
 export OMP_NUM_THREADS=100
-export VLLM_USE_V1=1
 export VLLM_ASCEND_LLMDD_RPC_PORT=5759
 
 vllm serve /models/deepseek_r1_w8a8 \
@@ -168,7 +165,6 @@ export HCCL_SOCKET_IFNAME="eth0"
 export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/examples/disaggregated_prefill_v1/ranktable.json
 export OMP_PROC_BIND=false
 export OMP_NUM_THREADS=100
-export VLLM_USE_V1=1
 export VLLM_ASCEND_LLMDD_RPC_PORT=5859
 
 vllm serve /models/deepseek_r1_w8a8 \
@@ -7,8 +7,6 @@ export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=path-to-rank-table
 export OMP_PROC_BIND=false
 export OMP_NUM_THREADS=100
-
-export VLLM_USE_V1=1
 
 vllm serve model_path \
     --host 0.0.0.0 \
     --port 20002 \
@@ -11,8 +11,6 @@ export HCCL_DETERMINISTIC=True
 export HCCL_BUFFSIZE=1024
 export TASK_QUEUE_ENABLE=1
-
-export VLLM_USE_V1=1
 
 export ASCEND_RT_VISIBLE_DEVICES=$1
 
 vllm serve model_path \
@@ -7,7 +7,6 @@ export HCCL_SOCKET_IFNAME="eth0"
 export OMP_PROC_BIND=false
 export OMP_NUM_THREADS=100
 
-export VLLM_USE_V1=1
 export VLLM_USE_MODELSCOPE=true
 
 export ASCEND_LAUNCH_BLOCKING=0
@@ -72,7 +72,6 @@ async def test_models(model: str) -> None:
         "OMP_PROC_BIND": "false",
         "HCCL_BUFFSIZE": "1024",
         "PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
-        "VLLM_USE_V1": "1"
     }
     additional_config = {
         "ascend_scheduler_config": {
@@ -73,7 +73,6 @@ async def test_models(model: str, mode: str, tp_size: int) -> None:
     env_dict = {
         "TASK_QUEUE_ENABLE": "1",
         "OMP_PROC_BIND": "false",
-        "VLLM_USE_V1": "1",
         "HCCL_OP_EXPANSION_MODE": "AIV",
         "VLLM_ASCEND_ENABLE_FLASHCOMM": "1",
         "VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE": "1",
@@ -2,7 +2,6 @@
 export LCCL_DETERMINISTIC=1
 export HCCL_DETERMINISTIC=true
 export CLOSE_MATMUL_K_SHIFT=1
-export VLLM_USE_V1=1
 
 set -xe
 
@@ -30,8 +30,6 @@ from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
 MODELS = ["Qwen/Qwen3-0.6B", "vllm-ascend/DeepSeek-V2-Lite-W8A8"]
 
 
-@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0",
-                    reason="aclgraph only support on v1")
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("max_tokens", [4])
 @patch.dict(os.environ, {"VLLM_ASCEND_FLASHCOMM2_PARALLEL_SIZE": "0"})
@@ -27,7 +27,6 @@ else:
     from vllm.utils.hashing import sha256
 
 EOS_TOKEN_ID = 50256
-os.environ["VLLM_USE_V1"] = "1"
 
 
 def assert_scheduler_empty(scheduler: Scheduler):
@@ -9,7 +9,6 @@ import torch
 from einops import rearrange
 from torch import nn
 from transformers.activations import ACT2FN
-from vllm import envs
 from vllm.attention import AttentionBackend, AttentionMetadata
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import (CacheConfig, ModelConfig, SpeculativeConfig,
@@ -668,7 +667,6 @@ class CustomQwen3NextForCausalLM(Qwen3NextForCausalLM):
         scheduler_config = vllm_config.scheduler_config
         assert not cache_config.enable_prefix_caching, \
             "Qwen3Next currently does not support prefix caching"
-        assert envs.VLLM_USE_V1, "Qwen3Next requires VLLM_USE_V1"
         self.quant_config = vllm_config.quant_config
         self.config = config
         self.scheduler_config = scheduler_config
@@ -1,6 +1,5 @@
 import ast
 
-import vllm.envs as envs
 from vllm.config.speculative import SpeculativeConfig
 from vllm.logger import logger
@@ -163,11 +162,6 @@ def __post_init__(self):
 
         # Replace hf_config for EAGLE draft_model
         if self.method in ("eagle", "eagle3"):
-            if self.enable_chunked_prefill and not envs.VLLM_USE_V1:
-                raise ValueError(
-                    "Chunked prefill and EAGLE are not compatible "
-                    "when using V0.")
-
             from vllm.transformers_utils.configs import SpeculatorsConfig
             from vllm.transformers_utils.configs.eagle import EAGLEConfig
@@ -20,7 +20,6 @@ import os
 from typing import TYPE_CHECKING, Optional, Tuple
 
 import torch
-import vllm.envs as envs_vllm
 from vllm.logger import logger
 from vllm.platforms import Platform, PlatformEnum
@@ -117,8 +116,6 @@ class NPUPlatform(Platform):
 
     @classmethod
     def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
-        if not envs_vllm.VLLM_USE_V1:
-            raise ValueError("vLLM Ascend does not support V0 engine.")
         # initialize ascend config from vllm additional_config
         ascend_config = init_ascend_config(vllm_config)
 
@@ -21,7 +21,6 @@ from typing import Any, List, Optional, Union
 import torch
 import torch.nn.functional as F
 import vllm
-import vllm.envs as envs
 from torch import nn
 from transformers import Qwen2Config
 from vllm.attention import AttentionMetadata, AttentionType
@@ -112,12 +111,9 @@ class CustomQwen2Attention(Qwen2Attention):
                                            is_prefill=False,
                                            is_qwen_torchair=True)
         forward_kwargs = {}
-        if envs.VLLM_USE_V1:
-            output_shape = q.shape
-            output = torch.empty(output_shape,
-                                 dtype=q.dtype,
-                                 device=q.device)
-            forward_kwargs['output'] = output
+        output_shape = q.shape
+        output = torch.empty(output_shape, dtype=q.dtype, device=q.device)
+        forward_kwargs['output'] = output
 
         attn_output = self.attn.impl.forward(self.attn,
                                              q,
@@ -19,7 +19,6 @@
 from typing import Any, List, Optional, Union
 
 import torch
-import vllm.envs as envs
 from torch import nn
 from transformers import PretrainedConfig
 from vllm.attention import Attention, AttentionMetadata
@@ -244,12 +243,9 @@ class CustomQwen3MoeAttention(Qwen3MoeAttention):
                                            is_prefill=False,
                                            is_qwen_torchair=True)
         forward_kwargs = {}
-        if envs.VLLM_USE_V1:
-            output_shape = q.shape
-            output = torch.empty(output_shape,
-                                 dtype=q.dtype,
-                                 device=q.device)
-            forward_kwargs['output'] = output
+        output_shape = q.shape
+        output = torch.empty(output_shape, dtype=q.dtype, device=q.device)
+        forward_kwargs['output'] = output
 
         attn_output = self.attn.impl.forward(self.attn,
                                              q,