Remove VLLM_USE_V1 (#4086)

Drop all VLLM_USE_V1 usage. This environment variable has already been removed from vLLM itself (the V1 engine is now the only engine), so every branch guarded by it is dead code.

- vLLM version: v0.11.0
- vLLM main: 83f478bb19
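For out-of-tree code that still has to run against older vLLM releases where the variable exists, a tolerant lookup is one option. A minimal sketch, not part of this commit; the fallback logic is an assumption:

```python
import vllm.envs as envs

# VLLM_USE_V1 no longer exists in vllm.envs; on current vLLM the V1
# engine is the only engine, so treat a missing attribute as "V1 on".
use_v1 = getattr(envs, "VLLM_USE_V1", True)
```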

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Author: wangxiyuan
Committed: 2025-11-11 15:43:39 +08:00 (by GitHub)
Parent: d5567680a2
Commit: f811a24bf0
21 changed files with 7 additions and 63 deletions


@@ -9,7 +9,6 @@ import torch
 from einops import rearrange
 from torch import nn
 from transformers.activations import ACT2FN
-from vllm import envs
 from vllm.attention import AttentionBackend, AttentionMetadata
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import (CacheConfig, ModelConfig, SpeculativeConfig,
@@ -668,7 +667,6 @@ class CustomQwen3NextForCausalLM(Qwen3NextForCausalLM):
         scheduler_config = vllm_config.scheduler_config
         assert not cache_config.enable_prefix_caching, \
             "Qwen3Next currently does not support prefix caching"
-        assert envs.VLLM_USE_V1, "Qwen3Next requires VLLM_USE_V1"
         self.quant_config = vllm_config.quant_config
         self.config = config
         self.scheduler_config = scheduler_config


@@ -1,6 +1,5 @@
 import ast
-import vllm.envs as envs
 from vllm.config.speculative import SpeculativeConfig
 from vllm.logger import logger
@@ -163,11 +162,6 @@ def __post_init__(self):
         # Replace hf_config for EAGLE draft_model
         if self.method in ("eagle", "eagle3"):
-            if self.enable_chunked_prefill and not envs.VLLM_USE_V1:
-                raise ValueError(
-                    "Chunked prefill and EAGLE are not compatible "
-                    "when using V0.")
             from vllm.transformers_utils.configs import SpeculatorsConfig
             from vllm.transformers_utils.configs.eagle import EAGLEConfig
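The deleted guard only applied to the V0 engine; on V1, EAGLE speculative decoding and chunked prefill work together, which is why the check is simply dropped rather than rewritten. A hedged sketch of enabling both (the model names and the exact `speculative_config` keys are illustrative assumptions, not taken from this commit):

```python
from vllm import LLM

# Illustrative only: on the V1 engine, chunked prefill no longer has
# to be disabled when EAGLE speculative decoding is configured.
llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",  # assumed target model
    speculative_config={
        "method": "eagle",
        "model": "yuhuili/EAGLE-LLaMA3.1-Instruct-8B",  # assumed draft
        "num_speculative_tokens": 2,
    },
    enable_chunked_prefill=True,
)
```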


@@ -20,7 +20,6 @@ import os
 from typing import TYPE_CHECKING, Optional, Tuple
 import torch
-import vllm.envs as envs_vllm
 from vllm.logger import logger
 from vllm.platforms import Platform, PlatformEnum
@@ -117,8 +116,6 @@ class NPUPlatform(Platform):
     @classmethod
     def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
-        if not envs_vllm.VLLM_USE_V1:
-            raise ValueError("vLLM Ascend does not support V0 engine.")
         # initialize ascend config from vllm additional_config
         ascend_config = init_ascend_config(vllm_config)


@@ -21,7 +21,6 @@ from typing import Any, List, Optional, Union
 import torch
 import torch.nn.functional as F
 import vllm
-import vllm.envs as envs
 from torch import nn
 from transformers import Qwen2Config
 from vllm.attention import AttentionMetadata, AttentionType
@@ -112,12 +111,9 @@ class CustomQwen2Attention(Qwen2Attention):
                                                   is_prefill=False,
                                                   is_qwen_torchair=True)
         forward_kwargs = {}
-        if envs.VLLM_USE_V1:
-            output_shape = q.shape
-            output = torch.empty(output_shape,
-                                 dtype=q.dtype,
-                                 device=q.device)
-            forward_kwargs['output'] = output
+        output_shape = q.shape
+        output = torch.empty(output_shape, dtype=q.dtype, device=q.device)
+        forward_kwargs['output'] = output
         attn_output = self.attn.impl.forward(self.attn,
                                              q,
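Both this hunk and the matching one in CustomQwen3MoeAttention below collapse to the same V1-only pattern: always pre-allocate the output tensor and hand it to the attention backend so results are written in place, with no engine branch. A minimal sketch of that pattern; the `impl.forward` signature here is an assumption based on the truncated call above:

```python
import torch

def attention_forward(attn, q, k, v):
    # V1-only path: unconditionally pre-allocate the output buffer and
    # let the backend fill it, instead of branching on VLLM_USE_V1.
    output = torch.empty(q.shape, dtype=q.dtype, device=q.device)
    attn.impl.forward(attn, q, k, v, output=output)  # assumed signature
    return output
```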


@@ -19,7 +19,6 @@
 from typing import Any, List, Optional, Union
 import torch
-import vllm.envs as envs
 from torch import nn
 from transformers import PretrainedConfig
 from vllm.attention import Attention, AttentionMetadata
@@ -244,12 +243,9 @@ class CustomQwen3MoeAttention(Qwen3MoeAttention):
                                                   is_prefill=False,
                                                   is_qwen_torchair=True)
         forward_kwargs = {}
-        if envs.VLLM_USE_V1:
-            output_shape = q.shape
-            output = torch.empty(output_shape,
-                                 dtype=q.dtype,
-                                 device=q.device)
-            forward_kwargs['output'] = output
+        output_shape = q.shape
+        output = torch.empty(output_shape, dtype=q.dtype, device=q.device)
+        forward_kwargs['output'] = output
         attn_output = self.attn.impl.forward(self.attn,
                                              q,