[Misc] Remove VLLM_USE_V1 usage in code (#1764)

We plan to remove the V0 code starting with this version. The first step is to remove every use of `VLLM_USE_V1` in the code, so that the V1 code path is always taken.

Related: https://github.com/vllm-project/vllm-ascend/issues/1620

- vLLM version: v0.9.2
- vLLM main: 61e20828da

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-07-15 11:52:16 +08:00
parent 494b0f474f
commit 7bdada58eb
6 changed files with 100 additions and 217 deletions
--- a/vllm_ascend/models/deepseek_v2.py
+++ b/vllm_ascend/models/deepseek_v2.py
@@ -29,7 +29,6 @@ from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union

 import torch
 import torch_npu
-import vllm.envs as envs
 from torch import nn
 from transformers import PretrainedConfig
 from vllm.attention import Attention, AttentionMetadata
@@ -579,20 +578,17 @@ class CustomDeepseekV2MLAAttention(DeepseekV2MLAAttention):
        else:
            hidden_states_or_q_c = hidden_states
        if self.torchair_graph_enabled:
-            if envs.VLLM_USE_V1:
-                output_shape = hidden_states.shape
-                output = torch.empty(output_shape,
-                                     dtype=hidden_states_or_q_c.dtype,
-                                     device=hidden_states_or_q_c.device)
-                forward_kwargs['output'] = output
-
+            output_shape = hidden_states.shape
+            output = torch.empty(output_shape,
+                                 dtype=hidden_states_or_q_c.dtype,
+                                 device=hidden_states_or_q_c.device)
+            forward_kwargs['output'] = output
            output = self.mla_attn.impl.forward(self.mla_attn,
                                                hidden_states_or_q_c,
                                                hidden_states, None, kv_cache,
                                                attn_metadata,
                                                **forward_kwargs)
-            if envs.VLLM_USE_V1:
-                output = output.view(-1, output_shape[-1])
+            output = output.view(-1, output_shape[-1])
            return output
        else:
            kv_c, k_pe = self.kv_a_proj_with_mqa(hidden_states)[0].split(
@@ -660,7 +656,7 @@ class CustomDeepseekV2DecoderLayer(DeepseekV2DecoderLayer):
                prefix=f"{prefix}.mlp",
            )
            self.mla_moe_communication = ascend_config.torchair_graph_config.enable_multistream_moe \
-                and model_config.use_mla and envs.VLLM_USE_V1 and self.tp_size > 1
+                and model_config.use_mla and self.tp_size > 1
        else:
            self.mlp = CustomDeepseekV2MLP(
                hidden_size=config.hidden_size,