[MISC] Clean up useless env USE_OPTIMIZED_MODEL (#6618)

Clean up useless env `USE_OPTIMIZED_MODEL` - vLLM version: v0.15.0 - vLLM main: d7e17aaacd Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2026-02-09 15:38:58 +08:00
parent b7aa511daa
commit 9c6d031797
2 changed files with 2 additions and 8 deletions
--- a/vllm_ascend/envs.py
+++ b/vllm_ascend/envs.py
@@ -58,7 +58,7 @@ env_variables: dict[str, Callable[[], Any]] = {
    "ASCEND_HOME_PATH": lambda: os.getenv("ASCEND_HOME_PATH", None),
    # The path for HCCL library, it's used by pyhccl communicator backend. If
    # not set, the default value is libhccl.so.
-    "HCCL_SO_PATH": lambda: os.environ.get("HCCL_SO_PATH", None),
+    "HCCL_SO_PATH": lambda: os.getenv("HCCL_SO_PATH", None),
    # The version of vllm is installed. This value is used for developers who
    # installed vllm from source locally. In this case, the version of vllm is
    # usually changed. For example, if the version of vllm is "0.9.0", but when
@@ -66,10 +66,6 @@ env_variables: dict[str, Callable[[], Any]] = {
    # In this case, developers need to set this value to "0.9.0" to make sure
    # that the correct package is installed.
    "VLLM_VERSION": lambda: os.getenv("VLLM_VERSION", None),
-    # Some models are optimized by vllm ascend. While in some case, e.g. rlhf
-    # training, the optimized model may not be suitable. In this case, set this
-    # value to False to disable the optimized model.
-    "USE_OPTIMIZED_MODEL": lambda: bool(int(os.getenv("USE_OPTIMIZED_MODEL", "1"))),
    # Whether to enable MatmulAllReduce fusion kernel when tensor parallel is enabled.
    # this feature is supported in A2, and eager mode will get better performance.
    "VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE": lambda: bool(int(os.getenv("VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE", "0"))),
--- a/vllm_ascend/ops/mm_encoder_attention.py
+++ b/vllm_ascend/ops/mm_encoder_attention.py
@@ -21,8 +21,6 @@ import torch.nn.functional as F
 import torch_npu
 from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention  # type: ignore

-import vllm_ascend.envs as envs_ascend
-
 MIN_PAD_SIZE = 64  # min_size to pad weight
 MAX_PAD_SIZE = 128  # max_size to pad weight

@@ -93,7 +91,7 @@ class AscendMMEncoderAttention(MMEncoderAttention):
        # q, k, v: [b, s, head, head_dim] -> [b * s, head, head_dim]
        q, k, v = self.reshape_qkv_to_3d(query, key, value, bsz, q_len, kv_len)

-        enable_pad = envs_ascend.USE_OPTIMIZED_MODEL and self.head_size > MIN_PAD_SIZE and self.head_size < MAX_PAD_SIZE
+        enable_pad = self.head_size > MIN_PAD_SIZE and self.head_size < MAX_PAD_SIZE

        if enable_pad:
            origin_shape = q.shape[-1]