Remove VLLM_USE_V1 (#4086)

Drop all usage of the VLLM_USE_V1 environment variable. This variable has already been removed from vLLM.

- vLLM version: v0.11.0
- vLLM main: 83f478bb19

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
wangxiyuan
2025-11-11 15:43:39 +08:00
committed by GitHub
parent d5567680a2
commit f811a24bf0
21 changed files with 7 additions and 63 deletions

View File

@@ -46,14 +46,13 @@ jobs:
test: test:
if: ${{ contains(github.event.pull_request.labels.*.name, 'performance-test') && contains(github.event.pull_request.labels.*.name, 'ready-for-test') || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }} if: ${{ contains(github.event.pull_request.labels.*.name, 'performance-test') && contains(github.event.pull_request.labels.*.name, 'ready-for-test') || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
name: Benchmarks/vLLM=${{ matrix.vllm_branch }}, vLLM-Ascend=${{ matrix.vllm_ascend_branch }}, use_v1=${{ matrix.vllm_use_v1 }} name: Benchmarks/vLLM=${{ matrix.vllm_branch }}, vLLM-Ascend=${{ matrix.vllm_ascend_branch }}
runs-on: 'linux-arm64-npu-static-8' runs-on: 'linux-arm64-npu-static-8'
strategy: strategy:
matrix: matrix:
include: include:
- vllm_branch: v0.11.0 - vllm_branch: v0.11.0
vllm_ascend_branch: main vllm_ascend_branch: main
vllm_use_v1: 1
max-parallel: 1 max-parallel: 1
container: container:
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-910b-ubuntu22.04-py3.11 image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-910b-ubuntu22.04-py3.11
@@ -73,7 +72,6 @@ jobs:
VLLM_USE_MODELSCOPE: True VLLM_USE_MODELSCOPE: True
ES_OM_DOMAIN: ${{ secrets.ES_OM_DOMAIN }} ES_OM_DOMAIN: ${{ secrets.ES_OM_DOMAIN }}
ES_OM_AUTHORIZATION: ${{ secrets.ES_OM_AUTHORIZATION }} ES_OM_AUTHORIZATION: ${{ secrets.ES_OM_AUTHORIZATION }}
VLLM_USE_V1: ${{ matrix.vllm_use_v1 }}
steps: steps:
- name: Check npu and CANN info - name: Check npu and CANN info
run: | run: |
@@ -200,7 +198,6 @@ jobs:
--created_at "$commit_time_no_tz" \ --created_at "$commit_time_no_tz" \
--res_dir ./benchmarks/results \ --res_dir ./benchmarks/results \
--error "$ERROR_MSG" \ --error "$ERROR_MSG" \
--extra_feat '{"VLLM_USE_V1": "${{ matrix.vllm_use_v1 }}"}'
rm -rf ./benchmarks/results rm -rf ./benchmarks/results
cd - cd -
done < commit_log.txt done < commit_log.txt

View File

@@ -160,7 +160,6 @@ export TP_SOCKET_IFNAME=$nic_name
export HCCL_SOCKET_IFNAME=$nic_name export HCCL_SOCKET_IFNAME=$nic_name
export OMP_PROC_BIND=false export OMP_PROC_BIND=false
export OMP_NUM_THREADS=100 export OMP_NUM_THREADS=100
export VLLM_USE_V1=1
export HCCL_BUFFSIZE=1024 export HCCL_BUFFSIZE=1024
vllm serve vllm-ascend/DeepSeek-V3.1-W8A8 \ vllm serve vllm-ascend/DeepSeek-V3.1-W8A8 \

View File

@@ -70,7 +70,6 @@ export TP_SOCKET_IFNAME=$nic_name
export HCCL_SOCKET_IFNAME=$nic_name export HCCL_SOCKET_IFNAME=$nic_name
export OMP_PROC_BIND=false export OMP_PROC_BIND=false
export OMP_NUM_THREADS=100 export OMP_NUM_THREADS=100
export VLLM_USE_V1=1
export HCCL_BUFFSIZE=1024 export HCCL_BUFFSIZE=1024
# The w8a8 weight can be obtained from https://www.modelscope.cn/models/vllm-ascend/Kimi-K2-Instruct-W8A8 # The w8a8 weight can be obtained from https://www.modelscope.cn/models/vllm-ascend/Kimi-K2-Instruct-W8A8
@@ -116,7 +115,6 @@ export TP_SOCKET_IFNAME=$nic_name
export HCCL_SOCKET_IFNAME=$nic_name export HCCL_SOCKET_IFNAME=$nic_name
export OMP_PROC_BIND=false export OMP_PROC_BIND=false
export OMP_NUM_THREADS=100 export OMP_NUM_THREADS=100
export VLLM_USE_V1=1
export HCCL_BUFFSIZE=1024 export HCCL_BUFFSIZE=1024
vllm serve /home/cache/weights/Kimi-K2-Instruct-W8A8 \ vllm serve /home/cache/weights/Kimi-K2-Instruct-W8A8 \

View File

@@ -104,7 +104,6 @@ export HCCL_SOCKET_IFNAME="eth0"
export DISAGGREGATED_PREFILL_RANK_TABLE_PATH="/path/to/your/generated/ranktable.json" export DISAGGREGATED_PREFILL_RANK_TABLE_PATH="/path/to/your/generated/ranktable.json"
export OMP_PROC_BIND=false export OMP_PROC_BIND=false
export OMP_NUM_THREADS=10 export OMP_NUM_THREADS=10
export VLLM_USE_V1=1
vllm serve /model/Qwen3-30B-A3B \ vllm serve /model/Qwen3-30B-A3B \
--host 0.0.0.0 \ --host 0.0.0.0 \
@@ -144,7 +143,6 @@ export HCCL_SOCKET_IFNAME="eth0"
export DISAGGREGATED_PREFILL_RANK_TABLE_PATH="/path/to/your/generated/ranktable.json" export DISAGGREGATED_PREFILL_RANK_TABLE_PATH="/path/to/your/generated/ranktable.json"
export OMP_PROC_BIND=false export OMP_PROC_BIND=false
export OMP_NUM_THREADS=10 export OMP_NUM_THREADS=10
export VLLM_USE_V1=1
vllm serve /model/Qwen3-30B-A3B \ vllm serve /model/Qwen3-30B-A3B \
--host 0.0.0.0 \ --host 0.0.0.0 \
@@ -183,7 +181,6 @@ export HCCL_SOCKET_IFNAME="eth0"
export DISAGGREGATED_PREFILL_RANK_TABLE_PATH="/path/to/your/generated/ranktable.json" export DISAGGREGATED_PREFILL_RANK_TABLE_PATH="/path/to/your/generated/ranktable.json"
export OMP_PROC_BIND=false export OMP_PROC_BIND=false
export OMP_NUM_THREADS=10 export OMP_NUM_THREADS=10
export VLLM_USE_V1=1
vllm serve /model/Qwen3-30B-A3B \ vllm serve /model/Qwen3-30B-A3B \
--host 0.0.0.0 \ --host 0.0.0.0 \

View File

@@ -107,7 +107,6 @@ export HCCL_IF_IP=192.0.0.1
export GLOO_SOCKET_IFNAME="eth0" # network card name export GLOO_SOCKET_IFNAME="eth0" # network card name
export TP_SOCKET_IFNAME="eth0" export TP_SOCKET_IFNAME="eth0"
export HCCL_SOCKET_IFNAME="eth0" export HCCL_SOCKET_IFNAME="eth0"
export VLLM_USE_V1=1
export HCCL_BUFFSIZE=1024 export HCCL_BUFFSIZE=1024
export OMP_PROC_BIND=false export OMP_PROC_BIND=false
export OMP_NUM_THREADS=10 export OMP_NUM_THREADS=10
@@ -162,7 +161,6 @@ export HCCL_IF_IP=192.0.0.2
export GLOO_SOCKET_IFNAME="eth0" # network card name export GLOO_SOCKET_IFNAME="eth0" # network card name
export TP_SOCKET_IFNAME="eth0" export TP_SOCKET_IFNAME="eth0"
export HCCL_SOCKET_IFNAME="eth0" export HCCL_SOCKET_IFNAME="eth0"
export VLLM_USE_V1=1
export HCCL_BUFFSIZE=1024 export HCCL_BUFFSIZE=1024
export OMP_PROC_BIND=false export OMP_PROC_BIND=false
export OMP_NUM_THREADS=10 export OMP_NUM_THREADS=10
@@ -217,7 +215,6 @@ export HCCL_IF_IP=192.0.0.3
export GLOO_SOCKET_IFNAME="eth0" # network card name export GLOO_SOCKET_IFNAME="eth0" # network card name
export TP_SOCKET_IFNAME="eth0" export TP_SOCKET_IFNAME="eth0"
export HCCL_SOCKET_IFNAME="eth0" export HCCL_SOCKET_IFNAME="eth0"
export VLLM_USE_V1=1
export HCCL_BUFFSIZE=2048 export HCCL_BUFFSIZE=2048
export OMP_PROC_BIND=false export OMP_PROC_BIND=false
export OMP_NUM_THREADS=10 export OMP_NUM_THREADS=10
@@ -274,7 +271,6 @@ export HCCL_IF_IP=192.0.0.4
export GLOO_SOCKET_IFNAME="eth0" # network card name export GLOO_SOCKET_IFNAME="eth0" # network card name
export TP_SOCKET_IFNAME="eth0" export TP_SOCKET_IFNAME="eth0"
export HCCL_SOCKET_IFNAME="eth0" export HCCL_SOCKET_IFNAME="eth0"
export VLLM_USE_V1=1
export HCCL_BUFFSIZE=2048 export HCCL_BUFFSIZE=2048
export OMP_PROC_BIND=false export OMP_PROC_BIND=false
export OMP_NUM_THREADS=10 export OMP_NUM_THREADS=10
@@ -338,7 +334,6 @@ export HCCL_IF_IP=192.0.0.1
export GLOO_SOCKET_IFNAME="eth0" # network card name export GLOO_SOCKET_IFNAME="eth0" # network card name
export TP_SOCKET_IFNAME="eth0" export TP_SOCKET_IFNAME="eth0"
export HCCL_SOCKET_IFNAME="eth0" export HCCL_SOCKET_IFNAME="eth0"
export VLLM_USE_V1=1
export HCCL_BUFFSIZE=1024 export HCCL_BUFFSIZE=1024
export OMP_PROC_BIND=false export OMP_PROC_BIND=false
export OMP_NUM_THREADS=10 export OMP_NUM_THREADS=10
@@ -393,7 +388,6 @@ export HCCL_IF_IP=192.0.0.2
export GLOO_SOCKET_IFNAME="eth0" # network card name export GLOO_SOCKET_IFNAME="eth0" # network card name
export TP_SOCKET_IFNAME="eth0" export TP_SOCKET_IFNAME="eth0"
export HCCL_SOCKET_IFNAME="eth0" export HCCL_SOCKET_IFNAME="eth0"
export VLLM_USE_V1=1
export HCCL_BUFFSIZE=1024 export HCCL_BUFFSIZE=1024
export OMP_PROC_BIND=false export OMP_PROC_BIND=false
export OMP_NUM_THREADS=10 export OMP_NUM_THREADS=10
@@ -448,7 +442,6 @@ export HCCL_IF_IP=192.0.0.3
export GLOO_SOCKET_IFNAME="eth0" # network card name export GLOO_SOCKET_IFNAME="eth0" # network card name
export TP_SOCKET_IFNAME="eth0" export TP_SOCKET_IFNAME="eth0"
export HCCL_SOCKET_IFNAME="eth0" export HCCL_SOCKET_IFNAME="eth0"
export VLLM_USE_V1=1
export HCCL_BUFFSIZE=2048 export HCCL_BUFFSIZE=2048
export OMP_PROC_BIND=false export OMP_PROC_BIND=false
export OMP_NUM_THREADS=10 export OMP_NUM_THREADS=10
@@ -505,7 +498,6 @@ export HCCL_IF_IP=192.0.0.4
export GLOO_SOCKET_IFNAME="eth0" # network card name export GLOO_SOCKET_IFNAME="eth0" # network card name
export TP_SOCKET_IFNAME="eth0" export TP_SOCKET_IFNAME="eth0"
export HCCL_SOCKET_IFNAME="eth0" export HCCL_SOCKET_IFNAME="eth0"
export VLLM_USE_V1=1
export HCCL_BUFFSIZE=2048 export HCCL_BUFFSIZE=2048
export OMP_PROC_BIND=false export OMP_PROC_BIND=false
export OMP_NUM_THREADS=10 export OMP_NUM_THREADS=10

View File

@@ -70,7 +70,6 @@ export TP_SOCKET_IFNAME=$nic_name
export HCCL_SOCKET_IFNAME=$nic_name export HCCL_SOCKET_IFNAME=$nic_name
export OMP_PROC_BIND=false export OMP_PROC_BIND=false
export OMP_NUM_THREADS=100 export OMP_NUM_THREADS=100
export VLLM_USE_V1=1
export HCCL_BUFFSIZE=1024 export HCCL_BUFFSIZE=1024
vllm serve Qwen/Qwen3-VL-235B-A22B-Instruct \ vllm serve Qwen/Qwen3-VL-235B-A22B-Instruct \
@@ -112,7 +111,6 @@ export TP_SOCKET_IFNAME=$nic_name
export HCCL_SOCKET_IFNAME=$nic_name export HCCL_SOCKET_IFNAME=$nic_name
export OMP_PROC_BIND=false export OMP_PROC_BIND=false
export OMP_NUM_THREADS=100 export OMP_NUM_THREADS=100
export VLLM_USE_V1=1
export HCCL_BUFFSIZE=1024 export HCCL_BUFFSIZE=1024
vllm serve Qwen/Qwen3-VL-235B-A22B-Instruct \ vllm serve Qwen/Qwen3-VL-235B-A22B-Instruct \

View File

@@ -82,7 +82,6 @@ The content of the multi_producer.sh script:
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
export PYTHONPATH=$PYTHONPATH:/xxxxx/vllm export PYTHONPATH=$PYTHONPATH:/xxxxx/vllm
export MOONCAKE_CONFIG_PATH="/xxxxxx/mooncake.json" export MOONCAKE_CONFIG_PATH="/xxxxxx/mooncake.json"
export VLLM_USE_V1=1
export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3 export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3
export ACL_OP_INIT_MODE=1 export ACL_OP_INIT_MODE=1
export ASCEND_BUFFER_POOL=4:8 export ASCEND_BUFFER_POOL=4:8
@@ -145,7 +144,6 @@ The content of multi_consumer.sh:
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
export PYTHONPATH=$PYTHONPATH:/xxxxx/vllm export PYTHONPATH=$PYTHONPATH:/xxxxx/vllm
export MOONCAKE_CONFIG_PATH="/xxxxx/mooncake.json" export MOONCAKE_CONFIG_PATH="/xxxxx/mooncake.json"
export VLLM_USE_V1=1
export ASCEND_RT_VISIBLE_DEVICES=4,5,6,7 export ASCEND_RT_VISIBLE_DEVICES=4,5,6,7
export ACL_OP_INIT_MODE=1 export ACL_OP_INIT_MODE=1
export ASCEND_BUFFER_POOL=4:8 export ASCEND_BUFFER_POOL=4:8
@@ -246,7 +244,6 @@ Content of mixed_department.sh:
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
export PYTHONPATH=$PYTHONPATH:/xxxxx/vllm export PYTHONPATH=$PYTHONPATH:/xxxxx/vllm
export MOONCAKE_CONFIG_PATH="/xxxxxx/mooncake.json" export MOONCAKE_CONFIG_PATH="/xxxxxx/mooncake.json"
export VLLM_USE_V1=1
export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3 export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3
export ACL_OP_INIT_MODE=1 export ACL_OP_INIT_MODE=1
export ASCEND_BUFFER_POOL=4:8 export ASCEND_BUFFER_POOL=4:8

View File

@@ -41,7 +41,6 @@ export HCCL_SOCKET_IFNAME="eth0"
export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/examples/disaggregated_prefill_v1/ranktable.json export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/examples/disaggregated_prefill_v1/ranktable.json
export OMP_PROC_BIND=false export OMP_PROC_BIND=false
export OMP_NUM_THREADS=100 export OMP_NUM_THREADS=100
export VLLM_USE_V1=1
export VLLM_ASCEND_LLMDD_RPC_PORT=5559 export VLLM_ASCEND_LLMDD_RPC_PORT=5559
vllm serve /models/deepseek_r1_w8a8 \ vllm serve /models/deepseek_r1_w8a8 \
@@ -82,7 +81,6 @@ export HCCL_SOCKET_IFNAME="eth0"
export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/examples/disaggregated_prefill_v1/ranktable.json export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/examples/disaggregated_prefill_v1/ranktable.json
export OMP_PROC_BIND=false export OMP_PROC_BIND=false
export OMP_NUM_THREADS=100 export OMP_NUM_THREADS=100
export VLLM_USE_V1=1
export VLLM_ASCEND_LLMDD_RPC_PORT=5659 export VLLM_ASCEND_LLMDD_RPC_PORT=5659
vllm serve /models/deepseek_r1_w8a8 \ vllm serve /models/deepseek_r1_w8a8 \
@@ -126,7 +124,6 @@ export HCCL_SOCKET_IFNAME="eth0"
export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/examples/disaggregated_prefill_v1/ranktable.json export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/examples/disaggregated_prefill_v1/ranktable.json
export OMP_PROC_BIND=false export OMP_PROC_BIND=false
export OMP_NUM_THREADS=100 export OMP_NUM_THREADS=100
export VLLM_USE_V1=1
export VLLM_ASCEND_LLMDD_RPC_PORT=5759 export VLLM_ASCEND_LLMDD_RPC_PORT=5759
vllm serve /models/deepseek_r1_w8a8 \ vllm serve /models/deepseek_r1_w8a8 \
@@ -168,7 +165,6 @@ export HCCL_SOCKET_IFNAME="eth0"
export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/examples/disaggregated_prefill_v1/ranktable.json export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/examples/disaggregated_prefill_v1/ranktable.json
export OMP_PROC_BIND=false export OMP_PROC_BIND=false
export OMP_NUM_THREADS=100 export OMP_NUM_THREADS=100
export VLLM_USE_V1=1
export VLLM_ASCEND_LLMDD_RPC_PORT=5859 export VLLM_ASCEND_LLMDD_RPC_PORT=5859
vllm serve /models/deepseek_r1_w8a8 \ vllm serve /models/deepseek_r1_w8a8 \

View File

@@ -7,8 +7,6 @@ export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=path-to-rank-table
export OMP_PROC_BIND=false export OMP_PROC_BIND=false
export OMP_NUM_THREADS=100 export OMP_NUM_THREADS=100
export VLLM_USE_V1=1
vllm serve model_path \ vllm serve model_path \
--host 0.0.0.0 \ --host 0.0.0.0 \
--port 20002 \ --port 20002 \

View File

@@ -11,8 +11,6 @@ export HCCL_DETERMINISTIC=True
export HCCL_BUFFSIZE=1024 export HCCL_BUFFSIZE=1024
export TASK_QUEUE_ENABLE=1 export TASK_QUEUE_ENABLE=1
export VLLM_USE_V1=1
export ASCEND_RT_VISIBLE_DEVICES=$1 export ASCEND_RT_VISIBLE_DEVICES=$1
vllm serve model_path \ vllm serve model_path \

View File

@@ -7,7 +7,6 @@ export HCCL_SOCKET_IFNAME="eth0"
export OMP_PROC_BIND=false export OMP_PROC_BIND=false
export OMP_NUM_THREADS=100 export OMP_NUM_THREADS=100
export VLLM_USE_V1=1
export VLLM_USE_MODELSCOPE=true export VLLM_USE_MODELSCOPE=true
export ASCEND_LAUNCH_BLOCKING=0 export ASCEND_LAUNCH_BLOCKING=0

View File

@@ -72,7 +72,6 @@ async def test_models(model: str) -> None:
"OMP_PROC_BIND": "false", "OMP_PROC_BIND": "false",
"HCCL_BUFFSIZE": "1024", "HCCL_BUFFSIZE": "1024",
"PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True", "PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
"VLLM_USE_V1": "1"
} }
additional_config = { additional_config = {
"ascend_scheduler_config": { "ascend_scheduler_config": {

View File

@@ -73,7 +73,6 @@ async def test_models(model: str, mode: str, tp_size: int) -> None:
env_dict = { env_dict = {
"TASK_QUEUE_ENABLE": "1", "TASK_QUEUE_ENABLE": "1",
"OMP_PROC_BIND": "false", "OMP_PROC_BIND": "false",
"VLLM_USE_V1": "1",
"HCCL_OP_EXPANSION_MODE": "AIV", "HCCL_OP_EXPANSION_MODE": "AIV",
"VLLM_ASCEND_ENABLE_FLASHCOMM": "1", "VLLM_ASCEND_ENABLE_FLASHCOMM": "1",
"VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE": "1", "VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE": "1",

View File

@@ -2,7 +2,6 @@
export LCCL_DETERMINISTIC=1 export LCCL_DETERMINISTIC=1
export HCCL_DETERMINISTIC=true export HCCL_DETERMINISTIC=true
export CLOSE_MATMUL_K_SHIFT=1 export CLOSE_MATMUL_K_SHIFT=1
export VLLM_USE_V1=1
set -xe set -xe

View File

@@ -30,8 +30,6 @@ from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
MODELS = ["Qwen/Qwen3-0.6B", "vllm-ascend/DeepSeek-V2-Lite-W8A8"] MODELS = ["Qwen/Qwen3-0.6B", "vllm-ascend/DeepSeek-V2-Lite-W8A8"]
@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0",
reason="aclgraph only support on v1")
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [4]) @pytest.mark.parametrize("max_tokens", [4])
@patch.dict(os.environ, {"VLLM_ASCEND_FLASHCOMM2_PARALLEL_SIZE": "0"}) @patch.dict(os.environ, {"VLLM_ASCEND_FLASHCOMM2_PARALLEL_SIZE": "0"})

View File

@@ -27,7 +27,6 @@ else:
from vllm.utils.hashing import sha256 from vllm.utils.hashing import sha256
EOS_TOKEN_ID = 50256 EOS_TOKEN_ID = 50256
os.environ["VLLM_USE_V1"] = "1"
def assert_scheduler_empty(scheduler: Scheduler): def assert_scheduler_empty(scheduler: Scheduler):

View File

@@ -9,7 +9,6 @@ import torch
from einops import rearrange from einops import rearrange
from torch import nn from torch import nn
from transformers.activations import ACT2FN from transformers.activations import ACT2FN
from vllm import envs
from vllm.attention import AttentionBackend, AttentionMetadata from vllm.attention import AttentionBackend, AttentionMetadata
from vllm.compilation.decorators import support_torch_compile from vllm.compilation.decorators import support_torch_compile
from vllm.config import (CacheConfig, ModelConfig, SpeculativeConfig, from vllm.config import (CacheConfig, ModelConfig, SpeculativeConfig,
@@ -668,7 +667,6 @@ class CustomQwen3NextForCausalLM(Qwen3NextForCausalLM):
scheduler_config = vllm_config.scheduler_config scheduler_config = vllm_config.scheduler_config
assert not cache_config.enable_prefix_caching, \ assert not cache_config.enable_prefix_caching, \
"Qwen3Next currently does not support prefix caching" "Qwen3Next currently does not support prefix caching"
assert envs.VLLM_USE_V1, "Qwen3Next requires VLLM_USE_V1"
self.quant_config = vllm_config.quant_config self.quant_config = vllm_config.quant_config
self.config = config self.config = config
self.scheduler_config = scheduler_config self.scheduler_config = scheduler_config

View File

@@ -1,6 +1,5 @@
import ast import ast
import vllm.envs as envs
from vllm.config.speculative import SpeculativeConfig from vllm.config.speculative import SpeculativeConfig
from vllm.logger import logger from vllm.logger import logger
@@ -163,11 +162,6 @@ def __post_init__(self):
# Replace hf_config for EAGLE draft_model # Replace hf_config for EAGLE draft_model
if self.method in ("eagle", "eagle3"): if self.method in ("eagle", "eagle3"):
if self.enable_chunked_prefill and not envs.VLLM_USE_V1:
raise ValueError(
"Chunked prefill and EAGLE are not compatible "
"when using V0.")
from vllm.transformers_utils.configs import SpeculatorsConfig from vllm.transformers_utils.configs import SpeculatorsConfig
from vllm.transformers_utils.configs.eagle import EAGLEConfig from vllm.transformers_utils.configs.eagle import EAGLEConfig

View File

@@ -20,7 +20,6 @@ import os
from typing import TYPE_CHECKING, Optional, Tuple from typing import TYPE_CHECKING, Optional, Tuple
import torch import torch
import vllm.envs as envs_vllm
from vllm.logger import logger from vllm.logger import logger
from vllm.platforms import Platform, PlatformEnum from vllm.platforms import Platform, PlatformEnum
@@ -117,8 +116,6 @@ class NPUPlatform(Platform):
@classmethod @classmethod
def check_and_update_config(cls, vllm_config: VllmConfig) -> None: def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
if not envs_vllm.VLLM_USE_V1:
raise ValueError("vLLM Ascend does not support V0 engine.")
# initialize ascend config from vllm additional_config # initialize ascend config from vllm additional_config
ascend_config = init_ascend_config(vllm_config) ascend_config = init_ascend_config(vllm_config)

View File

@@ -21,7 +21,6 @@ from typing import Any, List, Optional, Union
import torch import torch
import torch.nn.functional as F import torch.nn.functional as F
import vllm import vllm
import vllm.envs as envs
from torch import nn from torch import nn
from transformers import Qwen2Config from transformers import Qwen2Config
from vllm.attention import AttentionMetadata, AttentionType from vllm.attention import AttentionMetadata, AttentionType
@@ -112,12 +111,9 @@ class CustomQwen2Attention(Qwen2Attention):
is_prefill=False, is_prefill=False,
is_qwen_torchair=True) is_qwen_torchair=True)
forward_kwargs = {} forward_kwargs = {}
if envs.VLLM_USE_V1: output_shape = q.shape
output_shape = q.shape output = torch.empty(output_shape, dtype=q.dtype, device=q.device)
output = torch.empty(output_shape, forward_kwargs['output'] = output
dtype=q.dtype,
device=q.device)
forward_kwargs['output'] = output
attn_output = self.attn.impl.forward(self.attn, attn_output = self.attn.impl.forward(self.attn,
q, q,

View File

@@ -19,7 +19,6 @@
from typing import Any, List, Optional, Union from typing import Any, List, Optional, Union
import torch import torch
import vllm.envs as envs
from torch import nn from torch import nn
from transformers import PretrainedConfig from transformers import PretrainedConfig
from vllm.attention import Attention, AttentionMetadata from vllm.attention import Attention, AttentionMetadata
@@ -244,12 +243,9 @@ class CustomQwen3MoeAttention(Qwen3MoeAttention):
is_prefill=False, is_prefill=False,
is_qwen_torchair=True) is_qwen_torchair=True)
forward_kwargs = {} forward_kwargs = {}
if envs.VLLM_USE_V1: output_shape = q.shape
output_shape = q.shape output = torch.empty(output_shape, dtype=q.dtype, device=q.device)
output = torch.empty(output_shape, forward_kwargs['output'] = output
dtype=q.dtype,
device=q.device)
forward_kwargs['output'] = output
attn_output = self.attn.impl.forward(self.attn, attn_output = self.attn.impl.forward(self.attn,
q, q,