Remove VLLM_USE_V1 (#4086)
Drop VLLM_USE_V1 usage. This env has been removed from vLLM already.
- vLLM version: v0.11.0
- vLLM main:
83f478bb19
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
5
.github/workflows/nightly_benchmarks.yaml
vendored
5
.github/workflows/nightly_benchmarks.yaml
vendored
@@ -46,14 +46,13 @@ jobs:
|
|||||||
test:
|
test:
|
||||||
if: ${{ contains(github.event.pull_request.labels.*.name, 'performance-test') && contains(github.event.pull_request.labels.*.name, 'ready-for-test') || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
|
if: ${{ contains(github.event.pull_request.labels.*.name, 'performance-test') && contains(github.event.pull_request.labels.*.name, 'ready-for-test') || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
|
||||||
|
|
||||||
name: Benchmarks/vLLM=${{ matrix.vllm_branch }}, vLLM-Ascend=${{ matrix.vllm_ascend_branch }}, use_v1=${{ matrix.vllm_use_v1 }}
|
name: Benchmarks/vLLM=${{ matrix.vllm_branch }}, vLLM-Ascend=${{ matrix.vllm_ascend_branch }}
|
||||||
runs-on: 'linux-arm64-npu-static-8'
|
runs-on: 'linux-arm64-npu-static-8'
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
include:
|
include:
|
||||||
- vllm_branch: v0.11.0
|
- vllm_branch: v0.11.0
|
||||||
vllm_ascend_branch: main
|
vllm_ascend_branch: main
|
||||||
vllm_use_v1: 1
|
|
||||||
max-parallel: 1
|
max-parallel: 1
|
||||||
container:
|
container:
|
||||||
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-910b-ubuntu22.04-py3.11
|
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-910b-ubuntu22.04-py3.11
|
||||||
@@ -73,7 +72,6 @@ jobs:
|
|||||||
VLLM_USE_MODELSCOPE: True
|
VLLM_USE_MODELSCOPE: True
|
||||||
ES_OM_DOMAIN: ${{ secrets.ES_OM_DOMAIN }}
|
ES_OM_DOMAIN: ${{ secrets.ES_OM_DOMAIN }}
|
||||||
ES_OM_AUTHORIZATION: ${{ secrets.ES_OM_AUTHORIZATION }}
|
ES_OM_AUTHORIZATION: ${{ secrets.ES_OM_AUTHORIZATION }}
|
||||||
VLLM_USE_V1: ${{ matrix.vllm_use_v1 }}
|
|
||||||
steps:
|
steps:
|
||||||
- name: Check npu and CANN info
|
- name: Check npu and CANN info
|
||||||
run: |
|
run: |
|
||||||
@@ -200,7 +198,6 @@ jobs:
|
|||||||
--created_at "$commit_time_no_tz" \
|
--created_at "$commit_time_no_tz" \
|
||||||
--res_dir ./benchmarks/results \
|
--res_dir ./benchmarks/results \
|
||||||
--error "$ERROR_MSG" \
|
--error "$ERROR_MSG" \
|
||||||
--extra_feat '{"VLLM_USE_V1": "${{ matrix.vllm_use_v1 }}"}'
|
|
||||||
rm -rf ./benchmarks/results
|
rm -rf ./benchmarks/results
|
||||||
cd -
|
cd -
|
||||||
done < commit_log.txt
|
done < commit_log.txt
|
||||||
|
|||||||
@@ -160,7 +160,6 @@ export TP_SOCKET_IFNAME=$nic_name
|
|||||||
export HCCL_SOCKET_IFNAME=$nic_name
|
export HCCL_SOCKET_IFNAME=$nic_name
|
||||||
export OMP_PROC_BIND=false
|
export OMP_PROC_BIND=false
|
||||||
export OMP_NUM_THREADS=100
|
export OMP_NUM_THREADS=100
|
||||||
export VLLM_USE_V1=1
|
|
||||||
export HCCL_BUFFSIZE=1024
|
export HCCL_BUFFSIZE=1024
|
||||||
|
|
||||||
vllm serve vllm-ascend/DeepSeek-V3.1-W8A8 \
|
vllm serve vllm-ascend/DeepSeek-V3.1-W8A8 \
|
||||||
|
|||||||
@@ -70,7 +70,6 @@ export TP_SOCKET_IFNAME=$nic_name
|
|||||||
export HCCL_SOCKET_IFNAME=$nic_name
|
export HCCL_SOCKET_IFNAME=$nic_name
|
||||||
export OMP_PROC_BIND=false
|
export OMP_PROC_BIND=false
|
||||||
export OMP_NUM_THREADS=100
|
export OMP_NUM_THREADS=100
|
||||||
export VLLM_USE_V1=1
|
|
||||||
export HCCL_BUFFSIZE=1024
|
export HCCL_BUFFSIZE=1024
|
||||||
|
|
||||||
# The w8a8 weight can be obtained from https://www.modelscope.cn/models/vllm-ascend/Kimi-K2-Instruct-W8A8
|
# The w8a8 weight can be obtained from https://www.modelscope.cn/models/vllm-ascend/Kimi-K2-Instruct-W8A8
|
||||||
@@ -116,7 +115,6 @@ export TP_SOCKET_IFNAME=$nic_name
|
|||||||
export HCCL_SOCKET_IFNAME=$nic_name
|
export HCCL_SOCKET_IFNAME=$nic_name
|
||||||
export OMP_PROC_BIND=false
|
export OMP_PROC_BIND=false
|
||||||
export OMP_NUM_THREADS=100
|
export OMP_NUM_THREADS=100
|
||||||
export VLLM_USE_V1=1
|
|
||||||
export HCCL_BUFFSIZE=1024
|
export HCCL_BUFFSIZE=1024
|
||||||
|
|
||||||
vllm serve /home/cache/weights/Kimi-K2-Instruct-W8A8 \
|
vllm serve /home/cache/weights/Kimi-K2-Instruct-W8A8 \
|
||||||
|
|||||||
@@ -104,7 +104,6 @@ export HCCL_SOCKET_IFNAME="eth0"
|
|||||||
export DISAGGREGATED_PREFILL_RANK_TABLE_PATH="/path/to/your/generated/ranktable.json"
|
export DISAGGREGATED_PREFILL_RANK_TABLE_PATH="/path/to/your/generated/ranktable.json"
|
||||||
export OMP_PROC_BIND=false
|
export OMP_PROC_BIND=false
|
||||||
export OMP_NUM_THREADS=10
|
export OMP_NUM_THREADS=10
|
||||||
export VLLM_USE_V1=1
|
|
||||||
|
|
||||||
vllm serve /model/Qwen3-30B-A3B \
|
vllm serve /model/Qwen3-30B-A3B \
|
||||||
--host 0.0.0.0 \
|
--host 0.0.0.0 \
|
||||||
@@ -144,7 +143,6 @@ export HCCL_SOCKET_IFNAME="eth0"
|
|||||||
export DISAGGREGATED_PREFILL_RANK_TABLE_PATH="/path/to/your/generated/ranktable.json"
|
export DISAGGREGATED_PREFILL_RANK_TABLE_PATH="/path/to/your/generated/ranktable.json"
|
||||||
export OMP_PROC_BIND=false
|
export OMP_PROC_BIND=false
|
||||||
export OMP_NUM_THREADS=10
|
export OMP_NUM_THREADS=10
|
||||||
export VLLM_USE_V1=1
|
|
||||||
|
|
||||||
vllm serve /model/Qwen3-30B-A3B \
|
vllm serve /model/Qwen3-30B-A3B \
|
||||||
--host 0.0.0.0 \
|
--host 0.0.0.0 \
|
||||||
@@ -183,7 +181,6 @@ export HCCL_SOCKET_IFNAME="eth0"
|
|||||||
export DISAGGREGATED_PREFILL_RANK_TABLE_PATH="/path/to/your/generated/ranktable.json"
|
export DISAGGREGATED_PREFILL_RANK_TABLE_PATH="/path/to/your/generated/ranktable.json"
|
||||||
export OMP_PROC_BIND=false
|
export OMP_PROC_BIND=false
|
||||||
export OMP_NUM_THREADS=10
|
export OMP_NUM_THREADS=10
|
||||||
export VLLM_USE_V1=1
|
|
||||||
|
|
||||||
vllm serve /model/Qwen3-30B-A3B \
|
vllm serve /model/Qwen3-30B-A3B \
|
||||||
--host 0.0.0.0 \
|
--host 0.0.0.0 \
|
||||||
|
|||||||
@@ -107,7 +107,6 @@ export HCCL_IF_IP=192.0.0.1
|
|||||||
export GLOO_SOCKET_IFNAME="eth0" # network card name
|
export GLOO_SOCKET_IFNAME="eth0" # network card name
|
||||||
export TP_SOCKET_IFNAME="eth0"
|
export TP_SOCKET_IFNAME="eth0"
|
||||||
export HCCL_SOCKET_IFNAME="eth0"
|
export HCCL_SOCKET_IFNAME="eth0"
|
||||||
export VLLM_USE_V1=1
|
|
||||||
export HCCL_BUFFSIZE=1024
|
export HCCL_BUFFSIZE=1024
|
||||||
export OMP_PROC_BIND=false
|
export OMP_PROC_BIND=false
|
||||||
export OMP_NUM_THREADS=10
|
export OMP_NUM_THREADS=10
|
||||||
@@ -162,7 +161,6 @@ export HCCL_IF_IP=192.0.0.2
|
|||||||
export GLOO_SOCKET_IFNAME="eth0" # network card name
|
export GLOO_SOCKET_IFNAME="eth0" # network card name
|
||||||
export TP_SOCKET_IFNAME="eth0"
|
export TP_SOCKET_IFNAME="eth0"
|
||||||
export HCCL_SOCKET_IFNAME="eth0"
|
export HCCL_SOCKET_IFNAME="eth0"
|
||||||
export VLLM_USE_V1=1
|
|
||||||
export HCCL_BUFFSIZE=1024
|
export HCCL_BUFFSIZE=1024
|
||||||
export OMP_PROC_BIND=false
|
export OMP_PROC_BIND=false
|
||||||
export OMP_NUM_THREADS=10
|
export OMP_NUM_THREADS=10
|
||||||
@@ -217,7 +215,6 @@ export HCCL_IF_IP=192.0.0.3
|
|||||||
export GLOO_SOCKET_IFNAME="eth0" # network card name
|
export GLOO_SOCKET_IFNAME="eth0" # network card name
|
||||||
export TP_SOCKET_IFNAME="eth0"
|
export TP_SOCKET_IFNAME="eth0"
|
||||||
export HCCL_SOCKET_IFNAME="eth0"
|
export HCCL_SOCKET_IFNAME="eth0"
|
||||||
export VLLM_USE_V1=1
|
|
||||||
export HCCL_BUFFSIZE=2048
|
export HCCL_BUFFSIZE=2048
|
||||||
export OMP_PROC_BIND=false
|
export OMP_PROC_BIND=false
|
||||||
export OMP_NUM_THREADS=10
|
export OMP_NUM_THREADS=10
|
||||||
@@ -274,7 +271,6 @@ export HCCL_IF_IP=192.0.0.4
|
|||||||
export GLOO_SOCKET_IFNAME="eth0" # network card name
|
export GLOO_SOCKET_IFNAME="eth0" # network card name
|
||||||
export TP_SOCKET_IFNAME="eth0"
|
export TP_SOCKET_IFNAME="eth0"
|
||||||
export HCCL_SOCKET_IFNAME="eth0"
|
export HCCL_SOCKET_IFNAME="eth0"
|
||||||
export VLLM_USE_V1=1
|
|
||||||
export HCCL_BUFFSIZE=2048
|
export HCCL_BUFFSIZE=2048
|
||||||
export OMP_PROC_BIND=false
|
export OMP_PROC_BIND=false
|
||||||
export OMP_NUM_THREADS=10
|
export OMP_NUM_THREADS=10
|
||||||
@@ -338,7 +334,6 @@ export HCCL_IF_IP=192.0.0.1
|
|||||||
export GLOO_SOCKET_IFNAME="eth0" # network card name
|
export GLOO_SOCKET_IFNAME="eth0" # network card name
|
||||||
export TP_SOCKET_IFNAME="eth0"
|
export TP_SOCKET_IFNAME="eth0"
|
||||||
export HCCL_SOCKET_IFNAME="eth0"
|
export HCCL_SOCKET_IFNAME="eth0"
|
||||||
export VLLM_USE_V1=1
|
|
||||||
export HCCL_BUFFSIZE=1024
|
export HCCL_BUFFSIZE=1024
|
||||||
export OMP_PROC_BIND=false
|
export OMP_PROC_BIND=false
|
||||||
export OMP_NUM_THREADS=10
|
export OMP_NUM_THREADS=10
|
||||||
@@ -393,7 +388,6 @@ export HCCL_IF_IP=192.0.0.2
|
|||||||
export GLOO_SOCKET_IFNAME="eth0" # network card name
|
export GLOO_SOCKET_IFNAME="eth0" # network card name
|
||||||
export TP_SOCKET_IFNAME="eth0"
|
export TP_SOCKET_IFNAME="eth0"
|
||||||
export HCCL_SOCKET_IFNAME="eth0"
|
export HCCL_SOCKET_IFNAME="eth0"
|
||||||
export VLLM_USE_V1=1
|
|
||||||
export HCCL_BUFFSIZE=1024
|
export HCCL_BUFFSIZE=1024
|
||||||
export OMP_PROC_BIND=false
|
export OMP_PROC_BIND=false
|
||||||
export OMP_NUM_THREADS=10
|
export OMP_NUM_THREADS=10
|
||||||
@@ -448,7 +442,6 @@ export HCCL_IF_IP=192.0.0.3
|
|||||||
export GLOO_SOCKET_IFNAME="eth0" # network card name
|
export GLOO_SOCKET_IFNAME="eth0" # network card name
|
||||||
export TP_SOCKET_IFNAME="eth0"
|
export TP_SOCKET_IFNAME="eth0"
|
||||||
export HCCL_SOCKET_IFNAME="eth0"
|
export HCCL_SOCKET_IFNAME="eth0"
|
||||||
export VLLM_USE_V1=1
|
|
||||||
export HCCL_BUFFSIZE=2048
|
export HCCL_BUFFSIZE=2048
|
||||||
export OMP_PROC_BIND=false
|
export OMP_PROC_BIND=false
|
||||||
export OMP_NUM_THREADS=10
|
export OMP_NUM_THREADS=10
|
||||||
@@ -505,7 +498,6 @@ export HCCL_IF_IP=192.0.0.4
|
|||||||
export GLOO_SOCKET_IFNAME="eth0" # network card name
|
export GLOO_SOCKET_IFNAME="eth0" # network card name
|
||||||
export TP_SOCKET_IFNAME="eth0"
|
export TP_SOCKET_IFNAME="eth0"
|
||||||
export HCCL_SOCKET_IFNAME="eth0"
|
export HCCL_SOCKET_IFNAME="eth0"
|
||||||
export VLLM_USE_V1=1
|
|
||||||
export HCCL_BUFFSIZE=2048
|
export HCCL_BUFFSIZE=2048
|
||||||
export OMP_PROC_BIND=false
|
export OMP_PROC_BIND=false
|
||||||
export OMP_NUM_THREADS=10
|
export OMP_NUM_THREADS=10
|
||||||
|
|||||||
@@ -70,7 +70,6 @@ export TP_SOCKET_IFNAME=$nic_name
|
|||||||
export HCCL_SOCKET_IFNAME=$nic_name
|
export HCCL_SOCKET_IFNAME=$nic_name
|
||||||
export OMP_PROC_BIND=false
|
export OMP_PROC_BIND=false
|
||||||
export OMP_NUM_THREADS=100
|
export OMP_NUM_THREADS=100
|
||||||
export VLLM_USE_V1=1
|
|
||||||
export HCCL_BUFFSIZE=1024
|
export HCCL_BUFFSIZE=1024
|
||||||
|
|
||||||
vllm serve Qwen/Qwen3-VL-235B-A22B-Instruct \
|
vllm serve Qwen/Qwen3-VL-235B-A22B-Instruct \
|
||||||
@@ -112,7 +111,6 @@ export TP_SOCKET_IFNAME=$nic_name
|
|||||||
export HCCL_SOCKET_IFNAME=$nic_name
|
export HCCL_SOCKET_IFNAME=$nic_name
|
||||||
export OMP_PROC_BIND=false
|
export OMP_PROC_BIND=false
|
||||||
export OMP_NUM_THREADS=100
|
export OMP_NUM_THREADS=100
|
||||||
export VLLM_USE_V1=1
|
|
||||||
export HCCL_BUFFSIZE=1024
|
export HCCL_BUFFSIZE=1024
|
||||||
|
|
||||||
vllm serve Qwen/Qwen3-VL-235B-A22B-Instruct \
|
vllm serve Qwen/Qwen3-VL-235B-A22B-Instruct \
|
||||||
|
|||||||
@@ -82,7 +82,6 @@ The content of the multi_producer.sh script:
|
|||||||
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
|
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
|
||||||
export PYTHONPATH=$PYTHONPATH:/xxxxx/vllm
|
export PYTHONPATH=$PYTHONPATH:/xxxxx/vllm
|
||||||
export MOONCAKE_CONFIG_PATH="/xxxxxx/mooncake.json"
|
export MOONCAKE_CONFIG_PATH="/xxxxxx/mooncake.json"
|
||||||
export VLLM_USE_V1=1
|
|
||||||
export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3
|
export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3
|
||||||
export ACL_OP_INIT_MODE=1
|
export ACL_OP_INIT_MODE=1
|
||||||
export ASCEND_BUFFER_POOL=4:8
|
export ASCEND_BUFFER_POOL=4:8
|
||||||
@@ -145,7 +144,6 @@ The content of multi_consumer.sh:
|
|||||||
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
|
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
|
||||||
export PYTHONPATH=$PYTHONPATH:/xxxxx/vllm
|
export PYTHONPATH=$PYTHONPATH:/xxxxx/vllm
|
||||||
export MOONCAKE_CONFIG_PATH="/xxxxx/mooncake.json"
|
export MOONCAKE_CONFIG_PATH="/xxxxx/mooncake.json"
|
||||||
export VLLM_USE_V1=1
|
|
||||||
export ASCEND_RT_VISIBLE_DEVICES=4,5,6,7
|
export ASCEND_RT_VISIBLE_DEVICES=4,5,6,7
|
||||||
export ACL_OP_INIT_MODE=1
|
export ACL_OP_INIT_MODE=1
|
||||||
export ASCEND_BUFFER_POOL=4:8
|
export ASCEND_BUFFER_POOL=4:8
|
||||||
@@ -246,7 +244,6 @@ Content of mixed_department.sh:
|
|||||||
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
|
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
|
||||||
export PYTHONPATH=$PYTHONPATH:/xxxxx/vllm
|
export PYTHONPATH=$PYTHONPATH:/xxxxx/vllm
|
||||||
export MOONCAKE_CONFIG_PATH="/xxxxxx/mooncake.json"
|
export MOONCAKE_CONFIG_PATH="/xxxxxx/mooncake.json"
|
||||||
export VLLM_USE_V1=1
|
|
||||||
export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3
|
export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3
|
||||||
export ACL_OP_INIT_MODE=1
|
export ACL_OP_INIT_MODE=1
|
||||||
export ASCEND_BUFFER_POOL=4:8
|
export ASCEND_BUFFER_POOL=4:8
|
||||||
|
|||||||
@@ -41,7 +41,6 @@ export HCCL_SOCKET_IFNAME="eth0"
|
|||||||
export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/examples/disaggregated_prefill_v1/ranktable.json
|
export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/examples/disaggregated_prefill_v1/ranktable.json
|
||||||
export OMP_PROC_BIND=false
|
export OMP_PROC_BIND=false
|
||||||
export OMP_NUM_THREADS=100
|
export OMP_NUM_THREADS=100
|
||||||
export VLLM_USE_V1=1
|
|
||||||
export VLLM_ASCEND_LLMDD_RPC_PORT=5559
|
export VLLM_ASCEND_LLMDD_RPC_PORT=5559
|
||||||
|
|
||||||
vllm serve /models/deepseek_r1_w8a8 \
|
vllm serve /models/deepseek_r1_w8a8 \
|
||||||
@@ -82,7 +81,6 @@ export HCCL_SOCKET_IFNAME="eth0"
|
|||||||
export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/examples/disaggregated_prefill_v1/ranktable.json
|
export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/examples/disaggregated_prefill_v1/ranktable.json
|
||||||
export OMP_PROC_BIND=false
|
export OMP_PROC_BIND=false
|
||||||
export OMP_NUM_THREADS=100
|
export OMP_NUM_THREADS=100
|
||||||
export VLLM_USE_V1=1
|
|
||||||
export VLLM_ASCEND_LLMDD_RPC_PORT=5659
|
export VLLM_ASCEND_LLMDD_RPC_PORT=5659
|
||||||
|
|
||||||
vllm serve /models/deepseek_r1_w8a8 \
|
vllm serve /models/deepseek_r1_w8a8 \
|
||||||
@@ -126,7 +124,6 @@ export HCCL_SOCKET_IFNAME="eth0"
|
|||||||
export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/examples/disaggregated_prefill_v1/ranktable.json
|
export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/examples/disaggregated_prefill_v1/ranktable.json
|
||||||
export OMP_PROC_BIND=false
|
export OMP_PROC_BIND=false
|
||||||
export OMP_NUM_THREADS=100
|
export OMP_NUM_THREADS=100
|
||||||
export VLLM_USE_V1=1
|
|
||||||
export VLLM_ASCEND_LLMDD_RPC_PORT=5759
|
export VLLM_ASCEND_LLMDD_RPC_PORT=5759
|
||||||
|
|
||||||
vllm serve /models/deepseek_r1_w8a8 \
|
vllm serve /models/deepseek_r1_w8a8 \
|
||||||
@@ -168,7 +165,6 @@ export HCCL_SOCKET_IFNAME="eth0"
|
|||||||
export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/examples/disaggregated_prefill_v1/ranktable.json
|
export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/examples/disaggregated_prefill_v1/ranktable.json
|
||||||
export OMP_PROC_BIND=false
|
export OMP_PROC_BIND=false
|
||||||
export OMP_NUM_THREADS=100
|
export OMP_NUM_THREADS=100
|
||||||
export VLLM_USE_V1=1
|
|
||||||
export VLLM_ASCEND_LLMDD_RPC_PORT=5859
|
export VLLM_ASCEND_LLMDD_RPC_PORT=5859
|
||||||
|
|
||||||
vllm serve /models/deepseek_r1_w8a8 \
|
vllm serve /models/deepseek_r1_w8a8 \
|
||||||
|
|||||||
@@ -7,8 +7,6 @@ export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=path-to-rank-table
|
|||||||
export OMP_PROC_BIND=false
|
export OMP_PROC_BIND=false
|
||||||
export OMP_NUM_THREADS=100
|
export OMP_NUM_THREADS=100
|
||||||
|
|
||||||
export VLLM_USE_V1=1
|
|
||||||
|
|
||||||
vllm serve model_path \
|
vllm serve model_path \
|
||||||
--host 0.0.0.0 \
|
--host 0.0.0.0 \
|
||||||
--port 20002 \
|
--port 20002 \
|
||||||
|
|||||||
@@ -11,8 +11,6 @@ export HCCL_DETERMINISTIC=True
|
|||||||
export HCCL_BUFFSIZE=1024
|
export HCCL_BUFFSIZE=1024
|
||||||
export TASK_QUEUE_ENABLE=1
|
export TASK_QUEUE_ENABLE=1
|
||||||
|
|
||||||
export VLLM_USE_V1=1
|
|
||||||
|
|
||||||
export ASCEND_RT_VISIBLE_DEVICES=$1
|
export ASCEND_RT_VISIBLE_DEVICES=$1
|
||||||
|
|
||||||
vllm serve model_path \
|
vllm serve model_path \
|
||||||
|
|||||||
@@ -7,7 +7,6 @@ export HCCL_SOCKET_IFNAME="eth0"
|
|||||||
export OMP_PROC_BIND=false
|
export OMP_PROC_BIND=false
|
||||||
export OMP_NUM_THREADS=100
|
export OMP_NUM_THREADS=100
|
||||||
|
|
||||||
export VLLM_USE_V1=1
|
|
||||||
export VLLM_USE_MODELSCOPE=true
|
export VLLM_USE_MODELSCOPE=true
|
||||||
|
|
||||||
export ASCEND_LAUNCH_BLOCKING=0
|
export ASCEND_LAUNCH_BLOCKING=0
|
||||||
|
|||||||
@@ -72,7 +72,6 @@ async def test_models(model: str) -> None:
|
|||||||
"OMP_PROC_BIND": "false",
|
"OMP_PROC_BIND": "false",
|
||||||
"HCCL_BUFFSIZE": "1024",
|
"HCCL_BUFFSIZE": "1024",
|
||||||
"PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
|
"PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
|
||||||
"VLLM_USE_V1": "1"
|
|
||||||
}
|
}
|
||||||
additional_config = {
|
additional_config = {
|
||||||
"ascend_scheduler_config": {
|
"ascend_scheduler_config": {
|
||||||
|
|||||||
@@ -73,7 +73,6 @@ async def test_models(model: str, mode: str, tp_size: int) -> None:
|
|||||||
env_dict = {
|
env_dict = {
|
||||||
"TASK_QUEUE_ENABLE": "1",
|
"TASK_QUEUE_ENABLE": "1",
|
||||||
"OMP_PROC_BIND": "false",
|
"OMP_PROC_BIND": "false",
|
||||||
"VLLM_USE_V1": "1",
|
|
||||||
"HCCL_OP_EXPANSION_MODE": "AIV",
|
"HCCL_OP_EXPANSION_MODE": "AIV",
|
||||||
"VLLM_ASCEND_ENABLE_FLASHCOMM": "1",
|
"VLLM_ASCEND_ENABLE_FLASHCOMM": "1",
|
||||||
"VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE": "1",
|
"VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE": "1",
|
||||||
|
|||||||
@@ -2,7 +2,6 @@
|
|||||||
export LCCL_DETERMINISTIC=1
|
export LCCL_DETERMINISTIC=1
|
||||||
export HCCL_DETERMINISTIC=true
|
export HCCL_DETERMINISTIC=true
|
||||||
export CLOSE_MATMUL_K_SHIFT=1
|
export CLOSE_MATMUL_K_SHIFT=1
|
||||||
export VLLM_USE_V1=1
|
|
||||||
|
|
||||||
set -xe
|
set -xe
|
||||||
|
|
||||||
|
|||||||
@@ -30,8 +30,6 @@ from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
|
|||||||
MODELS = ["Qwen/Qwen3-0.6B", "vllm-ascend/DeepSeek-V2-Lite-W8A8"]
|
MODELS = ["Qwen/Qwen3-0.6B", "vllm-ascend/DeepSeek-V2-Lite-W8A8"]
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0",
|
|
||||||
reason="aclgraph only support on v1")
|
|
||||||
@pytest.mark.parametrize("model", MODELS)
|
@pytest.mark.parametrize("model", MODELS)
|
||||||
@pytest.mark.parametrize("max_tokens", [4])
|
@pytest.mark.parametrize("max_tokens", [4])
|
||||||
@patch.dict(os.environ, {"VLLM_ASCEND_FLASHCOMM2_PARALLEL_SIZE": "0"})
|
@patch.dict(os.environ, {"VLLM_ASCEND_FLASHCOMM2_PARALLEL_SIZE": "0"})
|
||||||
|
|||||||
@@ -27,7 +27,6 @@ else:
|
|||||||
from vllm.utils.hashing import sha256
|
from vllm.utils.hashing import sha256
|
||||||
|
|
||||||
EOS_TOKEN_ID = 50256
|
EOS_TOKEN_ID = 50256
|
||||||
os.environ["VLLM_USE_V1"] = "1"
|
|
||||||
|
|
||||||
|
|
||||||
def assert_scheduler_empty(scheduler: Scheduler):
|
def assert_scheduler_empty(scheduler: Scheduler):
|
||||||
|
|||||||
@@ -9,7 +9,6 @@ import torch
|
|||||||
from einops import rearrange
|
from einops import rearrange
|
||||||
from torch import nn
|
from torch import nn
|
||||||
from transformers.activations import ACT2FN
|
from transformers.activations import ACT2FN
|
||||||
from vllm import envs
|
|
||||||
from vllm.attention import AttentionBackend, AttentionMetadata
|
from vllm.attention import AttentionBackend, AttentionMetadata
|
||||||
from vllm.compilation.decorators import support_torch_compile
|
from vllm.compilation.decorators import support_torch_compile
|
||||||
from vllm.config import (CacheConfig, ModelConfig, SpeculativeConfig,
|
from vllm.config import (CacheConfig, ModelConfig, SpeculativeConfig,
|
||||||
@@ -668,7 +667,6 @@ class CustomQwen3NextForCausalLM(Qwen3NextForCausalLM):
|
|||||||
scheduler_config = vllm_config.scheduler_config
|
scheduler_config = vllm_config.scheduler_config
|
||||||
assert not cache_config.enable_prefix_caching, \
|
assert not cache_config.enable_prefix_caching, \
|
||||||
"Qwen3Next currently does not support prefix caching"
|
"Qwen3Next currently does not support prefix caching"
|
||||||
assert envs.VLLM_USE_V1, "Qwen3Next requires VLLM_USE_V1"
|
|
||||||
self.quant_config = vllm_config.quant_config
|
self.quant_config = vllm_config.quant_config
|
||||||
self.config = config
|
self.config = config
|
||||||
self.scheduler_config = scheduler_config
|
self.scheduler_config = scheduler_config
|
||||||
|
|||||||
@@ -1,6 +1,5 @@
|
|||||||
import ast
|
import ast
|
||||||
|
|
||||||
import vllm.envs as envs
|
|
||||||
from vllm.config.speculative import SpeculativeConfig
|
from vllm.config.speculative import SpeculativeConfig
|
||||||
from vllm.logger import logger
|
from vllm.logger import logger
|
||||||
|
|
||||||
@@ -163,11 +162,6 @@ def __post_init__(self):
|
|||||||
|
|
||||||
# Replace hf_config for EAGLE draft_model
|
# Replace hf_config for EAGLE draft_model
|
||||||
if self.method in ("eagle", "eagle3"):
|
if self.method in ("eagle", "eagle3"):
|
||||||
if self.enable_chunked_prefill and not envs.VLLM_USE_V1:
|
|
||||||
raise ValueError(
|
|
||||||
"Chunked prefill and EAGLE are not compatible "
|
|
||||||
"when using V0.")
|
|
||||||
|
|
||||||
from vllm.transformers_utils.configs import SpeculatorsConfig
|
from vllm.transformers_utils.configs import SpeculatorsConfig
|
||||||
from vllm.transformers_utils.configs.eagle import EAGLEConfig
|
from vllm.transformers_utils.configs.eagle import EAGLEConfig
|
||||||
|
|
||||||
|
|||||||
@@ -20,7 +20,6 @@ import os
|
|||||||
from typing import TYPE_CHECKING, Optional, Tuple
|
from typing import TYPE_CHECKING, Optional, Tuple
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import vllm.envs as envs_vllm
|
|
||||||
from vllm.logger import logger
|
from vllm.logger import logger
|
||||||
from vllm.platforms import Platform, PlatformEnum
|
from vllm.platforms import Platform, PlatformEnum
|
||||||
|
|
||||||
@@ -117,8 +116,6 @@ class NPUPlatform(Platform):
|
|||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
|
def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
|
||||||
if not envs_vllm.VLLM_USE_V1:
|
|
||||||
raise ValueError("vLLM Ascend does not support V0 engine.")
|
|
||||||
# initialize ascend config from vllm additional_config
|
# initialize ascend config from vllm additional_config
|
||||||
ascend_config = init_ascend_config(vllm_config)
|
ascend_config = init_ascend_config(vllm_config)
|
||||||
|
|
||||||
|
|||||||
@@ -21,7 +21,6 @@ from typing import Any, List, Optional, Union
|
|||||||
import torch
|
import torch
|
||||||
import torch.nn.functional as F
|
import torch.nn.functional as F
|
||||||
import vllm
|
import vllm
|
||||||
import vllm.envs as envs
|
|
||||||
from torch import nn
|
from torch import nn
|
||||||
from transformers import Qwen2Config
|
from transformers import Qwen2Config
|
||||||
from vllm.attention import AttentionMetadata, AttentionType
|
from vllm.attention import AttentionMetadata, AttentionType
|
||||||
@@ -112,11 +111,8 @@ class CustomQwen2Attention(Qwen2Attention):
|
|||||||
is_prefill=False,
|
is_prefill=False,
|
||||||
is_qwen_torchair=True)
|
is_qwen_torchair=True)
|
||||||
forward_kwargs = {}
|
forward_kwargs = {}
|
||||||
if envs.VLLM_USE_V1:
|
|
||||||
output_shape = q.shape
|
output_shape = q.shape
|
||||||
output = torch.empty(output_shape,
|
output = torch.empty(output_shape, dtype=q.dtype, device=q.device)
|
||||||
dtype=q.dtype,
|
|
||||||
device=q.device)
|
|
||||||
forward_kwargs['output'] = output
|
forward_kwargs['output'] = output
|
||||||
|
|
||||||
attn_output = self.attn.impl.forward(self.attn,
|
attn_output = self.attn.impl.forward(self.attn,
|
||||||
|
|||||||
@@ -19,7 +19,6 @@
|
|||||||
from typing import Any, List, Optional, Union
|
from typing import Any, List, Optional, Union
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import vllm.envs as envs
|
|
||||||
from torch import nn
|
from torch import nn
|
||||||
from transformers import PretrainedConfig
|
from transformers import PretrainedConfig
|
||||||
from vllm.attention import Attention, AttentionMetadata
|
from vllm.attention import Attention, AttentionMetadata
|
||||||
@@ -244,11 +243,8 @@ class CustomQwen3MoeAttention(Qwen3MoeAttention):
|
|||||||
is_prefill=False,
|
is_prefill=False,
|
||||||
is_qwen_torchair=True)
|
is_qwen_torchair=True)
|
||||||
forward_kwargs = {}
|
forward_kwargs = {}
|
||||||
if envs.VLLM_USE_V1:
|
|
||||||
output_shape = q.shape
|
output_shape = q.shape
|
||||||
output = torch.empty(output_shape,
|
output = torch.empty(output_shape, dtype=q.dtype, device=q.device)
|
||||||
dtype=q.dtype,
|
|
||||||
device=q.device)
|
|
||||||
forward_kwargs['output'] = output
|
forward_kwargs['output'] = output
|
||||||
|
|
||||||
attn_output = self.attn.impl.forward(self.attn,
|
attn_output = self.attn.impl.forward(self.attn,
|
||||||
|
|||||||
Reference in New Issue
Block a user