<!-- Thanks for sending a pull request! BEFORE SUBMITTING, PLEASE READ https://docs.vllm.ai/en/latest/contributing/overview.html --> ### What this PR does / why we need it? This PR resolves [issue 1147](https://github.com/vllm-project/vllm-ascend/issues/1147): 1. Move the fused_moe code into one file, `fused_moe.py`. 2. Integrate the branch conditions into the function `get_fused_moe_state`. ### Does this PR introduce _any_ user-facing change? 1. This PR removes the env `VLLM_ENABLE_MC2`: the env is unnecessary because the same judgment can be made from the current scenario without it, so keeping it only adds complexity. 2. This PR removes the env `USING_LCCL_COM`, which is already deprecated. 3. `additional_config.expert_tensor_parallel_size` is already deprecated; use the parameter `enable_expert_parallel` instead, consistent with vLLM. ### How was this patch tested? CI passed with new added/existing tests. Signed-off-by: zzzzwwjj <1183291235@qq.com>
142 lines
6.3 KiB
Python
142 lines
6.3 KiB
Python
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# This file is mainly adapted from vllm-project/vllm/vllm/envs.py
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
|
|
|
import os
|
|
from typing import Any, Callable, Dict
|
|
|
|
# The begin-* and end-* markers here are used by the documentation generator
# to extract the documented env vars.

# begin-env-vars-definition

def _bool_env(name: str, default: str) -> bool:
    """Read env var *name* and interpret it as a 0/1 boolean flag.

    Mirrors the original ``bool(int(os.getenv(name, default)))`` pattern,
    including raising ``ValueError`` when the variable is set to a
    non-integer string.
    """
    return bool(int(os.getenv(name, default)))


# Maps each supported environment variable to a zero-argument callable.
# Evaluation is lazy: the callable re-reads the process environment on every
# access (see the module-level ``__getattr__`` hook below), so changes made
# after import are picked up.
env_variables: Dict[str, Callable[[], Any]] = {
    # max compile thread number for package building. Usually, it is set to
    # the number of CPU cores. If not set, the default value is None, which
    # means all number of CPU cores will be used.
    "MAX_JOBS":
    lambda: os.getenv("MAX_JOBS", None),
    # The build type of the package. It can be one of the following values:
    # Release, Debug, RelWithDebugInfo. If not set, the default value is
    # Release.
    "CMAKE_BUILD_TYPE":
    lambda: os.getenv("CMAKE_BUILD_TYPE"),
    # Whether to compile custom kernels. If not set, the default value is
    # True. If set to False, the custom kernels will not be compiled. Please
    # note that the sleep mode feature will be disabled as well if custom
    # kernels are not compiled.
    "COMPILE_CUSTOM_KERNELS":
    lambda: _bool_env("COMPILE_CUSTOM_KERNELS", "1"),
    # The CXX compiler used for compiling the package. If not set, the default
    # value is None, which means the system default CXX compiler will be used.
    "CXX_COMPILER":
    lambda: os.getenv("CXX_COMPILER", None),
    # The C compiler used for compiling the package. If not set, the default
    # value is None, which means the system default C compiler will be used.
    "C_COMPILER":
    lambda: os.getenv("C_COMPILER", None),
    # Whether to enable the topk optimization. It's disabled by default for
    # experimental support. We'll make it enabled by default in the future.
    "VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE":
    lambda: _bool_env("VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE", "0"),
    # The version of the Ascend chip. If not set, the default value is
    # ASCEND910B1. It's used for package building. Please make sure that the
    # version is correct.
    "SOC_VERSION":
    lambda: os.getenv("SOC_VERSION", "ASCEND910B1"),
    # If set, vllm-ascend will print verbose logs during compilation.
    "VERBOSE":
    lambda: _bool_env("VERBOSE", "0"),
    # The home path for CANN toolkit. If not set, the default value is
    # /usr/local/Ascend/ascend-toolkit/latest
    "ASCEND_HOME_PATH":
    lambda: os.getenv("ASCEND_HOME_PATH", None),
    # The path for HCCN Tool; the tool will be called by the disaggregated
    # prefilling case.
    "HCCN_PATH":
    lambda: os.getenv("HCCN_PATH", "/usr/local/Ascend/driver/tools/hccn_tool"),
    # The path for the HCCL library; it's used by the pyhccl communicator
    # backend. If not set, the default value is libhccl.so.
    # (Was `os.environ.get`; normalized to `os.getenv`, which is an alias.)
    "HCCL_SO_PATH":
    lambda: os.getenv("HCCL_SO_PATH", None),
    # The prefill device id for disaggregated prefilling case.
    # (This comment was previously misplaced inside the HCCL_SO_PATH entry.)
    "PROMPT_DEVICE_ID":
    lambda: os.getenv("PROMPT_DEVICE_ID", None),
    # The decode device id for disaggregated prefilling case.
    "DECODE_DEVICE_ID":
    lambda: os.getenv("DECODE_DEVICE_ID", None),
    # The port number for llmdatadist communication. If not set, the default
    # value is 26000. Note: returned as a string, as in the original code.
    "LLMDATADIST_COMM_PORT":
    lambda: os.getenv("LLMDATADIST_COMM_PORT", "26000"),
    # The wait time for llmdatadist sync cache. If not set, the default value
    # is 5000 (ms). Note: returned as a string, as in the original code.
    "LLMDATADIST_SYNC_CACHE_WAIT_TIME":
    lambda: os.getenv("LLMDATADIST_SYNC_CACHE_WAIT_TIME", "5000"),
    # The version of vllm that is installed. This value is used by developers
    # who installed vllm from source locally. In this case, the version of
    # vllm is usually changed. For example, if the version of vllm is "0.9.0",
    # but when it's installed from source, the version of vllm is usually set
    # to "0.9.1". In this case, developers need to set this value to "0.9.0"
    # to make sure that the correct package is installed.
    "VLLM_VERSION":
    lambda: os.getenv("VLLM_VERSION", None),
    # Whether to enable the trace recompiles from pytorch.
    "VLLM_ASCEND_TRACE_RECOMPILES":
    lambda: _bool_env("VLLM_ASCEND_TRACE_RECOMPILES", "0"),
    # NOTE(review): presumably enables dual-batch overlap (DBO); the original
    # entry carried no comment — confirm the intended semantics.
    "VLLM_ASCEND_ENABLE_DBO":
    lambda: _bool_env("VLLM_ASCEND_ENABLE_DBO", "0"),
    # Whether to enable the model execute time observe profile. Disable it
    # when running vllm ascend in a production environment.
    "VLLM_ASCEND_MODEL_EXECUTE_TIME_OBSERVE":
    lambda: _bool_env("VLLM_ASCEND_MODEL_EXECUTE_TIME_OBSERVE", "0"),
    # MOE_ALL2ALL_BUFFER:
    # 0: default, normal init.
    # 1: enable moe_all2all_buffer.
    "MOE_ALL2ALL_BUFFER":
    lambda: _bool_env("MOE_ALL2ALL_BUFFER", "0"),
    # VLLM_ASCEND_ACL_OP_INIT_MODE:
    # 0: default, normal init.
    # 1: delay init until launch aclops.
    # 2: forbid aclops init and launch.
    # Find more details at https://gitee.com/ascend/pytorch/pulls/18094
    # NOTE(review): an earlier comment claimed the default is forced to `1`
    # in vllm-ascend (to avoid a segfault when `pin_memory` is enabled while
    # creating a tensor via `torch.tensor`), but the code default below is
    # "0" — confirm which is intended. The value stays a raw string here.
    "VLLM_ASCEND_ACL_OP_INIT_MODE":
    lambda: os.getenv("VLLM_ASCEND_ACL_OP_INIT_MODE", "0"),
    # Some models are optimized by vllm ascend. While in some cases, e.g.
    # rlhf training, the optimized model may not be suitable. In this case,
    # set this value to False to disable the optimized model.
    "USE_OPTIMIZED_MODEL":
    lambda: _bool_env("USE_OPTIMIZED_MODEL", "1"),
}

# end-env-vars-definition

def __getattr__(name: str):
    """Module-level attribute hook (PEP 562): resolve *name* lazily.

    Each access calls the matching entry in ``env_variables``, so the
    environment is re-read every time instead of being cached at import.
    Unknown names raise ``AttributeError`` as usual.
    """
    # Guard clause: reject unknown names up front.
    if name not in env_variables:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
    return env_variables[name]()
def __dir__():
    """Expose the supported environment-variable names to ``dir()``."""
    # Iterating the dict yields its keys; unpack into a fresh list.
    return [*env_variables]