From f811a24bf0cef84840d7d7c26629da15e00fd05f Mon Sep 17 00:00:00 2001 From: wangxiyuan Date: Tue, 11 Nov 2025 15:43:39 +0800 Subject: [PATCH] Remove VLLM_USE_V1 (#4086) Drop VLLM_USE_V1 usage. This env has been removed from vLLM already. - vLLM version: v0.11.0 - vLLM main: https://github.com/vllm-project/vllm/commit/83f478bb19489b41e9d208b47b4bb5a95ac171ac Signed-off-by: wangxiyuan --- .github/workflows/nightly_benchmarks.yaml | 5 +---- docs/source/tutorials/multi_node.md | 1 - docs/source/tutorials/multi_node_kimi.md | 2 -- .../multi_node_pd_disaggregation_llmdatadist.md | 3 --- .../tutorials/multi_node_pd_disaggregation_mooncake.md | 8 -------- docs/source/tutorials/multi_node_qwen3vl.md | 2 -- .../user_guide/feature_guide/kv_pool_mooncake.md | 3 --- examples/disaggregated_prefill_v1/README.md | 4 ---- examples/disaggregated_prefill_v1/run_server.sh | 2 -- examples/external_online_dp/run_dp_template.sh | 2 -- examples/run_dp_server.sh | 1 - .../test_prefix_cache_deepseek_r1_0528_w8a8.py | 1 - tests/e2e/nightly/models/test_qwq_32b.py | 1 - tests/e2e/pd_disaggreate/run_edge_case_test.sh | 1 - tests/e2e/singlecard/test_aclgraph_mem.py | 2 -- tests/ut/kv_connector/utils.py | 1 - vllm_ascend/models/qwen3_next.py | 2 -- vllm_ascend/patch/platform/patch_config.py | 6 ------ vllm_ascend/platform.py | 3 --- vllm_ascend/torchair/models/qwen2.py | 10 +++------- vllm_ascend/torchair/models/qwen3_moe.py | 10 +++------- 21 files changed, 7 insertions(+), 63 deletions(-) diff --git a/.github/workflows/nightly_benchmarks.yaml b/.github/workflows/nightly_benchmarks.yaml index a768cd66..258c434a 100644 --- a/.github/workflows/nightly_benchmarks.yaml +++ b/.github/workflows/nightly_benchmarks.yaml @@ -46,14 +46,13 @@ jobs: test: if: ${{ contains(github.event.pull_request.labels.*.name, 'performance-test') && contains(github.event.pull_request.labels.*.name, 'ready-for-test') || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }} - name: Benchmarks/vLLM=${{ matrix.vllm_branch }}, vLLM-Ascend=${{ matrix.vllm_ascend_branch }}, use_v1=${{ matrix.vllm_use_v1 }} + name: Benchmarks/vLLM=${{ matrix.vllm_branch }}, vLLM-Ascend=${{ matrix.vllm_ascend_branch }} runs-on: 'linux-arm64-npu-static-8' strategy: matrix: include: - vllm_branch: v0.11.0 vllm_ascend_branch: main - vllm_use_v1: 1 max-parallel: 1 container: image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-910b-ubuntu22.04-py3.11 @@ -73,7 +72,6 @@ jobs: VLLM_USE_MODELSCOPE: True ES_OM_DOMAIN: ${{ secrets.ES_OM_DOMAIN }} ES_OM_AUTHORIZATION: ${{ secrets.ES_OM_AUTHORIZATION }} - VLLM_USE_V1: ${{ matrix.vllm_use_v1 }} steps: - name: Check npu and CANN info run: | @@ -200,7 +198,6 @@ jobs: --created_at "$commit_time_no_tz" \ --res_dir ./benchmarks/results \ --error "$ERROR_MSG" \ - --extra_feat '{"VLLM_USE_V1": "${{ matrix.vllm_use_v1 }}"}' rm -rf ./benchmarks/results cd - done < commit_log.txt diff --git a/docs/source/tutorials/multi_node.md b/docs/source/tutorials/multi_node.md index 4471f823..6738ee4f 100644 --- a/docs/source/tutorials/multi_node.md +++ b/docs/source/tutorials/multi_node.md @@ -160,7 +160,6 @@ export TP_SOCKET_IFNAME=$nic_name export HCCL_SOCKET_IFNAME=$nic_name export OMP_PROC_BIND=false export OMP_NUM_THREADS=100 -export VLLM_USE_V1=1 export HCCL_BUFFSIZE=1024 vllm serve vllm-ascend/DeepSeek-V3.1-W8A8 \ diff --git a/docs/source/tutorials/multi_node_kimi.md b/docs/source/tutorials/multi_node_kimi.md index 0711aaff..1a5fe2eb 100644 --- a/docs/source/tutorials/multi_node_kimi.md +++ b/docs/source/tutorials/multi_node_kimi.md @@ -70,7 +70,6 @@ export TP_SOCKET_IFNAME=$nic_name export HCCL_SOCKET_IFNAME=$nic_name export OMP_PROC_BIND=false export OMP_NUM_THREADS=100 -export VLLM_USE_V1=1 export HCCL_BUFFSIZE=1024 # The w8a8 weight can be obtained from https://www.modelscope.cn/models/vllm-ascend/Kimi-K2-Instruct-W8A8 @@ -116,7 +115,6 @@ export TP_SOCKET_IFNAME=$nic_name export HCCL_SOCKET_IFNAME=$nic_name export OMP_PROC_BIND=false export OMP_NUM_THREADS=100 -export VLLM_USE_V1=1 export HCCL_BUFFSIZE=1024 vllm serve /home/cache/weights/Kimi-K2-Instruct-W8A8 \ diff --git a/docs/source/tutorials/multi_node_pd_disaggregation_llmdatadist.md b/docs/source/tutorials/multi_node_pd_disaggregation_llmdatadist.md index c0c2abac..3bd06daa 100644 --- a/docs/source/tutorials/multi_node_pd_disaggregation_llmdatadist.md +++ b/docs/source/tutorials/multi_node_pd_disaggregation_llmdatadist.md @@ -104,7 +104,6 @@ export HCCL_SOCKET_IFNAME="eth0" export DISAGGREGATED_PREFILL_RANK_TABLE_PATH="/path/to/your/generated/ranktable.json" export OMP_PROC_BIND=false export OMP_NUM_THREADS=10 -export VLLM_USE_V1=1 vllm serve /model/Qwen3-30B-A3B \ --host 0.0.0.0 \ @@ -144,7 +143,6 @@ export HCCL_SOCKET_IFNAME="eth0" export DISAGGREGATED_PREFILL_RANK_TABLE_PATH="/path/to/your/generated/ranktable.json" export OMP_PROC_BIND=false export OMP_NUM_THREADS=10 -export VLLM_USE_V1=1 vllm serve /model/Qwen3-30B-A3B \ --host 0.0.0.0 \ @@ -183,7 +181,6 @@ export HCCL_SOCKET_IFNAME="eth0" export DISAGGREGATED_PREFILL_RANK_TABLE_PATH="/path/to/your/generated/ranktable.json" export OMP_PROC_BIND=false export OMP_NUM_THREADS=10 -export VLLM_USE_V1=1 vllm serve /model/Qwen3-30B-A3B \ --host 0.0.0.0 \ diff --git a/docs/source/tutorials/multi_node_pd_disaggregation_mooncake.md b/docs/source/tutorials/multi_node_pd_disaggregation_mooncake.md index 1db83e07..6406cf1d 100644 --- a/docs/source/tutorials/multi_node_pd_disaggregation_mooncake.md +++ b/docs/source/tutorials/multi_node_pd_disaggregation_mooncake.md @@ -107,7 +107,6 @@ export HCCL_IF_IP=192.0.0.1 export GLOO_SOCKET_IFNAME="eth0" # network card name export TP_SOCKET_IFNAME="eth0" export HCCL_SOCKET_IFNAME="eth0" -export VLLM_USE_V1=1 export HCCL_BUFFSIZE=1024 export OMP_PROC_BIND=false export OMP_NUM_THREADS=10 @@ -162,7 +161,6 @@ export HCCL_IF_IP=192.0.0.2 export GLOO_SOCKET_IFNAME="eth0" # network card name export TP_SOCKET_IFNAME="eth0" export HCCL_SOCKET_IFNAME="eth0" -export VLLM_USE_V1=1 export HCCL_BUFFSIZE=1024 export OMP_PROC_BIND=false export OMP_NUM_THREADS=10 @@ -217,7 +215,6 @@ export HCCL_IF_IP=192.0.0.3 export GLOO_SOCKET_IFNAME="eth0" # network card name export TP_SOCKET_IFNAME="eth0" export HCCL_SOCKET_IFNAME="eth0" -export VLLM_USE_V1=1 export HCCL_BUFFSIZE=2048 export OMP_PROC_BIND=false export OMP_NUM_THREADS=10 @@ -274,7 +271,6 @@ export HCCL_IF_IP=192.0.0.4 export GLOO_SOCKET_IFNAME="eth0" # network card name export TP_SOCKET_IFNAME="eth0" export HCCL_SOCKET_IFNAME="eth0" -export VLLM_USE_V1=1 export HCCL_BUFFSIZE=2048 export OMP_PROC_BIND=false export OMP_NUM_THREADS=10 @@ -338,7 +334,6 @@ export HCCL_IF_IP=192.0.0.1 export GLOO_SOCKET_IFNAME="eth0" # network card name export TP_SOCKET_IFNAME="eth0" export HCCL_SOCKET_IFNAME="eth0" -export VLLM_USE_V1=1 export HCCL_BUFFSIZE=1024 export OMP_PROC_BIND=false export OMP_NUM_THREADS=10 @@ -393,7 +388,6 @@ export HCCL_IF_IP=192.0.0.2 export GLOO_SOCKET_IFNAME="eth0" # network card name export TP_SOCKET_IFNAME="eth0" export HCCL_SOCKET_IFNAME="eth0" -export VLLM_USE_V1=1 export HCCL_BUFFSIZE=1024 export OMP_PROC_BIND=false export OMP_NUM_THREADS=10 @@ -448,7 +442,6 @@ export HCCL_IF_IP=192.0.0.3 export GLOO_SOCKET_IFNAME="eth0" # network card name export TP_SOCKET_IFNAME="eth0" export HCCL_SOCKET_IFNAME="eth0" -export VLLM_USE_V1=1 export HCCL_BUFFSIZE=2048 export OMP_PROC_BIND=false export OMP_NUM_THREADS=10 @@ -505,7 +498,6 @@ export HCCL_IF_IP=192.0.0.4 export GLOO_SOCKET_IFNAME="eth0" # network card name export TP_SOCKET_IFNAME="eth0" export HCCL_SOCKET_IFNAME="eth0" -export VLLM_USE_V1=1 export HCCL_BUFFSIZE=2048 export OMP_PROC_BIND=false export OMP_NUM_THREADS=10 diff --git a/docs/source/tutorials/multi_node_qwen3vl.md b/docs/source/tutorials/multi_node_qwen3vl.md index af64df15..033bc6dc 100644 --- a/docs/source/tutorials/multi_node_qwen3vl.md +++ b/docs/source/tutorials/multi_node_qwen3vl.md @@ -70,7 +70,6 @@ export TP_SOCKET_IFNAME=$nic_name export HCCL_SOCKET_IFNAME=$nic_name export OMP_PROC_BIND=false export OMP_NUM_THREADS=100 -export VLLM_USE_V1=1 export HCCL_BUFFSIZE=1024 vllm serve Qwen/Qwen3-VL-235B-A22B-Instruct \ @@ -112,7 +111,6 @@ export TP_SOCKET_IFNAME=$nic_name export HCCL_SOCKET_IFNAME=$nic_name export OMP_PROC_BIND=false export OMP_NUM_THREADS=100 -export VLLM_USE_V1=1 export HCCL_BUFFSIZE=1024 vllm serve Qwen/Qwen3-VL-235B-A22B-Instruct \ diff --git a/docs/source/user_guide/feature_guide/kv_pool_mooncake.md b/docs/source/user_guide/feature_guide/kv_pool_mooncake.md index 34ab0479..c3693868 100644 --- a/docs/source/user_guide/feature_guide/kv_pool_mooncake.md +++ b/docs/source/user_guide/feature_guide/kv_pool_mooncake.md @@ -82,7 +82,6 @@ The content of the multi_producer.sh script: export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH export PYTHONPATH=$PYTHONPATH:/xxxxx/vllm export MOONCAKE_CONFIG_PATH="/xxxxxx/mooncake.json" -export VLLM_USE_V1=1 export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3 export ACL_OP_INIT_MODE=1 export ASCEND_BUFFER_POOL=4:8 @@ -145,7 +144,6 @@ The content of multi_consumer.sh: export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH export PYTHONPATH=$PYTHONPATH:/xxxxx/vllm export MOONCAKE_CONFIG_PATH="/xxxxx/mooncake.json" -export VLLM_USE_V1=1 export ASCEND_RT_VISIBLE_DEVICES=4,5,6,7 export ACL_OP_INIT_MODE=1 export ASCEND_BUFFER_POOL=4:8 @@ -246,7 +244,6 @@ Content of mixed_department.sh: export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH export PYTHONPATH=$PYTHONPATH:/xxxxx/vllm export MOONCAKE_CONFIG_PATH="/xxxxxx/mooncake.json" -export VLLM_USE_V1=1 export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3 export ACL_OP_INIT_MODE=1 export ASCEND_BUFFER_POOL=4:8 diff --git a/examples/disaggregated_prefill_v1/README.md b/examples/disaggregated_prefill_v1/README.md index 7a546a32..83f5565e 100644 --- a/examples/disaggregated_prefill_v1/README.md +++ b/examples/disaggregated_prefill_v1/README.md @@ -41,7 +41,6 @@ export HCCL_SOCKET_IFNAME="eth0" export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/examples/disaggregated_prefill_v1/ranktable.json export OMP_PROC_BIND=false export OMP_NUM_THREADS=100 -export VLLM_USE_V1=1 export VLLM_ASCEND_LLMDD_RPC_PORT=5559 vllm serve /models/deepseek_r1_w8a8 \ @@ -82,7 +81,6 @@ export HCCL_SOCKET_IFNAME="eth0" export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/examples/disaggregated_prefill_v1/ranktable.json export OMP_PROC_BIND=false export OMP_NUM_THREADS=100 -export VLLM_USE_V1=1 export VLLM_ASCEND_LLMDD_RPC_PORT=5659 vllm serve /models/deepseek_r1_w8a8 \ @@ -126,7 +124,6 @@ export HCCL_SOCKET_IFNAME="eth0" export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/examples/disaggregated_prefill_v1/ranktable.json export OMP_PROC_BIND=false export OMP_NUM_THREADS=100 -export VLLM_USE_V1=1 export VLLM_ASCEND_LLMDD_RPC_PORT=5759 vllm serve /models/deepseek_r1_w8a8 \ @@ -168,7 +165,6 @@ export HCCL_SOCKET_IFNAME="eth0" export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/examples/disaggregated_prefill_v1/ranktable.json export OMP_PROC_BIND=false export OMP_NUM_THREADS=100 -export VLLM_USE_V1=1 export VLLM_ASCEND_LLMDD_RPC_PORT=5859 vllm serve /models/deepseek_r1_w8a8 \ diff --git a/examples/disaggregated_prefill_v1/run_server.sh b/examples/disaggregated_prefill_v1/run_server.sh index 37cf6d3a..37833909 100644 --- a/examples/disaggregated_prefill_v1/run_server.sh +++ b/examples/disaggregated_prefill_v1/run_server.sh @@ -7,8 +7,6 @@ export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=path-to-rank-table export OMP_PROC_BIND=false export OMP_NUM_THREADS=100 -export VLLM_USE_V1=1 - vllm serve model_path \ --host 0.0.0.0 \ --port 20002 \ diff --git a/examples/external_online_dp/run_dp_template.sh b/examples/external_online_dp/run_dp_template.sh index 70f27feb..ff1fca05 100644 --- a/examples/external_online_dp/run_dp_template.sh +++ b/examples/external_online_dp/run_dp_template.sh @@ -11,8 +11,6 @@ export HCCL_DETERMINISTIC=True export HCCL_BUFFSIZE=1024 export TASK_QUEUE_ENABLE=1 -export VLLM_USE_V1=1 - export ASCEND_RT_VISIBLE_DEVICES=$1 vllm serve model_path \ diff --git a/examples/run_dp_server.sh b/examples/run_dp_server.sh index 97258123..9b9868c4 100644 --- a/examples/run_dp_server.sh +++ b/examples/run_dp_server.sh @@ -7,7 +7,6 @@ export HCCL_SOCKET_IFNAME="eth0" export OMP_PROC_BIND=false export OMP_NUM_THREADS=100 -export VLLM_USE_V1=1 export VLLM_USE_MODELSCOPE=true export ASCEND_LAUNCH_BLOCKING=0 diff --git a/tests/e2e/nightly/features/test_prefix_cache_deepseek_r1_0528_w8a8.py b/tests/e2e/nightly/features/test_prefix_cache_deepseek_r1_0528_w8a8.py index 67581e55..8ac1883d 100644 --- a/tests/e2e/nightly/features/test_prefix_cache_deepseek_r1_0528_w8a8.py +++ b/tests/e2e/nightly/features/test_prefix_cache_deepseek_r1_0528_w8a8.py @@ -72,7 +72,6 @@ async def test_models(model: str) -> None: "OMP_PROC_BIND": "false", "HCCL_BUFFSIZE": "1024", "PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True", - "VLLM_USE_V1": "1" } additional_config = { "ascend_scheduler_config": { diff --git a/tests/e2e/nightly/models/test_qwq_32b.py b/tests/e2e/nightly/models/test_qwq_32b.py index a4f843a7..ad3dd6b2 100644 --- a/tests/e2e/nightly/models/test_qwq_32b.py +++ b/tests/e2e/nightly/models/test_qwq_32b.py @@ -73,7 +73,6 @@ async def test_models(model: str, mode: str, tp_size: int) -> None: env_dict = { "TASK_QUEUE_ENABLE": "1", "OMP_PROC_BIND": "false", - "VLLM_USE_V1": "1", "HCCL_OP_EXPANSION_MODE": "AIV", "VLLM_ASCEND_ENABLE_FLASHCOMM": "1", "VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE": "1", diff --git a/tests/e2e/pd_disaggreate/run_edge_case_test.sh b/tests/e2e/pd_disaggreate/run_edge_case_test.sh index f0e7ace7..980edfbe 100644 --- a/tests/e2e/pd_disaggreate/run_edge_case_test.sh +++ b/tests/e2e/pd_disaggreate/run_edge_case_test.sh @@ -2,7 +2,6 @@ export LCCL_DETERMINISTIC=1 export HCCL_DETERMINISTIC=true export CLOSE_MATMUL_K_SHIFT=1 -export VLLM_USE_V1=1 set -xe diff --git a/tests/e2e/singlecard/test_aclgraph_mem.py b/tests/e2e/singlecard/test_aclgraph_mem.py index df7d355e..f7d578fb 100644 --- a/tests/e2e/singlecard/test_aclgraph_mem.py +++ b/tests/e2e/singlecard/test_aclgraph_mem.py @@ -30,8 +30,6 @@ from vllm_ascend.worker.model_runner_v1 import NPUModelRunner MODELS = ["Qwen/Qwen3-0.6B", "vllm-ascend/DeepSeek-V2-Lite-W8A8"] -@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0", - reason="aclgraph only support on v1") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("max_tokens", [4]) @patch.dict(os.environ, {"VLLM_ASCEND_FLASHCOMM2_PARALLEL_SIZE": "0"}) diff --git a/tests/ut/kv_connector/utils.py b/tests/ut/kv_connector/utils.py index 8c25ded6..1bc535c5 100644 --- a/tests/ut/kv_connector/utils.py +++ b/tests/ut/kv_connector/utils.py @@ -27,7 +27,6 @@ else: from vllm.utils.hashing import sha256 EOS_TOKEN_ID = 50256 -os.environ["VLLM_USE_V1"] = "1" def assert_scheduler_empty(scheduler: Scheduler): diff --git a/vllm_ascend/models/qwen3_next.py b/vllm_ascend/models/qwen3_next.py index b0bfde0e..622efe23 100644 --- a/vllm_ascend/models/qwen3_next.py +++ b/vllm_ascend/models/qwen3_next.py @@ -9,7 +9,6 @@ import torch from einops import rearrange from torch import nn from transformers.activations import ACT2FN -from vllm import envs from vllm.attention import AttentionBackend, AttentionMetadata from vllm.compilation.decorators import support_torch_compile from vllm.config import (CacheConfig, ModelConfig, SpeculativeConfig, @@ -668,7 +667,6 @@ class CustomQwen3NextForCausalLM(Qwen3NextForCausalLM): scheduler_config = vllm_config.scheduler_config assert not cache_config.enable_prefix_caching, \ "Qwen3Next currently does not support prefix caching" - assert envs.VLLM_USE_V1, "Qwen3Next requires VLLM_USE_V1" self.quant_config = vllm_config.quant_config self.config = config self.scheduler_config = scheduler_config diff --git a/vllm_ascend/patch/platform/patch_config.py b/vllm_ascend/patch/platform/patch_config.py index d6150383..0e8642d1 100644 --- a/vllm_ascend/patch/platform/patch_config.py +++ b/vllm_ascend/patch/platform/patch_config.py @@ -1,6 +1,5 @@ import ast -import vllm.envs as envs from vllm.config.speculative import SpeculativeConfig from vllm.logger import logger @@ -163,11 +162,6 @@ def __post_init__(self): # Replace hf_config for EAGLE draft_model if self.method in ("eagle", "eagle3"): - if self.enable_chunked_prefill and not envs.VLLM_USE_V1: - raise ValueError( - "Chunked prefill and EAGLE are not compatible " - "when using V0.") - from vllm.transformers_utils.configs import SpeculatorsConfig from vllm.transformers_utils.configs.eagle import EAGLEConfig diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py index 1ddd997d..0d295548 100644 --- a/vllm_ascend/platform.py +++ b/vllm_ascend/platform.py @@ -20,7 +20,6 @@ import os from typing import TYPE_CHECKING, Optional, Tuple import torch -import vllm.envs as envs_vllm from vllm.logger import logger from vllm.platforms import Platform, PlatformEnum @@ -117,8 +116,6 @@ class NPUPlatform(Platform): @classmethod def check_and_update_config(cls, vllm_config: VllmConfig) -> None: - if not envs_vllm.VLLM_USE_V1: - raise ValueError("vLLM Ascend does not support V0 engine.") # initialize ascend config from vllm additional_config ascend_config = init_ascend_config(vllm_config) diff --git a/vllm_ascend/torchair/models/qwen2.py b/vllm_ascend/torchair/models/qwen2.py index a5a198e0..a61abbdc 100644 --- a/vllm_ascend/torchair/models/qwen2.py +++ b/vllm_ascend/torchair/models/qwen2.py @@ -21,7 +21,6 @@ from typing import Any, List, Optional, Union import torch import torch.nn.functional as F import vllm -import vllm.envs as envs from torch import nn from transformers import Qwen2Config from vllm.attention import AttentionMetadata, AttentionType @@ -112,12 +111,9 @@ class CustomQwen2Attention(Qwen2Attention): is_prefill=False, is_qwen_torchair=True) forward_kwargs = {} - if envs.VLLM_USE_V1: - output_shape = q.shape - output = torch.empty(output_shape, - dtype=q.dtype, - device=q.device) - forward_kwargs['output'] = output + output_shape = q.shape + output = torch.empty(output_shape, dtype=q.dtype, device=q.device) + forward_kwargs['output'] = output attn_output = self.attn.impl.forward(self.attn, q, diff --git a/vllm_ascend/torchair/models/qwen3_moe.py b/vllm_ascend/torchair/models/qwen3_moe.py index 0c90412c..3ea3a56f 100644 --- a/vllm_ascend/torchair/models/qwen3_moe.py +++ b/vllm_ascend/torchair/models/qwen3_moe.py @@ -19,7 +19,6 @@ from typing import Any, List, Optional, Union import torch -import vllm.envs as envs from torch import nn from transformers import PretrainedConfig from vllm.attention import Attention, AttentionMetadata @@ -244,12 +243,9 @@ class CustomQwen3MoeAttention(Qwen3MoeAttention): is_prefill=False, is_qwen_torchair=True) forward_kwargs = {} - if envs.VLLM_USE_V1: - output_shape = q.shape - output = torch.empty(output_shape, - dtype=q.dtype, - device=q.device) - forward_kwargs['output'] = output + output_shape = q.shape + output = torch.empty(output_shape, dtype=q.dtype, device=q.device) + forward_kwargs['output'] = output attn_output = self.attn.impl.forward(self.attn, q,