2025-04-15 16:09:36 +08:00
|
|
|
#
|
|
|
|
|
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
|
|
|
|
|
# This file is a part of the vllm-ascend project.
|
|
|
|
|
#
|
|
|
|
|
# This file is mainly Adapted from vllm-project/vllm/vllm/envs.py
|
|
|
|
|
# Copyright 2023 The vLLM team.
|
|
|
|
|
#
|
|
|
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
|
# you may not use this file except in compliance with the License.
|
|
|
|
|
# You may obtain a copy of the License at
|
|
|
|
|
#
|
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
|
#
|
|
|
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
|
# limitations under the License.
|
|
|
|
|
#
|
|
|
|
|
|
2025-04-03 14:52:34 +08:00
|
|
|
import os
|
|
|
|
|
from typing import Any, Callable, Dict
|
|
|
|
|
|
2025-04-15 16:09:36 +08:00
|
|
|
# The begin-* and end* here are used by the documentation generator
|
|
|
|
|
# to extract the used env vars.
|
|
|
|
|
|
|
|
|
|
# begin-env-vars-definition
|
|
|
|
|
|
2025-04-03 14:52:34 +08:00
|
|
|
env_variables: Dict[str, Callable[[], Any]] = {
    # Max compile thread number for package building. Usually, it is set to
    # the number of CPU cores. If not set, the default value is None, which
    # means all CPU cores will be used.
    "MAX_JOBS":
    lambda: os.getenv("MAX_JOBS", None),
    # The build type of the package. It can be one of the following values:
    # Release, Debug, RelWithDebugInfo. If not set, the default value is
    # Release.
    "CMAKE_BUILD_TYPE":
    lambda: os.getenv("CMAKE_BUILD_TYPE"),
    # Whether to compile custom kernels. If not set, the default value is True.
    # If set to False, the custom kernels will not be compiled. Please note
    # that the sleep mode feature will be disabled as well if custom kernels
    # are not compiled.
    "COMPILE_CUSTOM_KERNELS":
    lambda: bool(int(os.getenv("COMPILE_CUSTOM_KERNELS", "1"))),
    # The CXX compiler used for compiling the package. If not set, the default
    # value is None, which means the system default CXX compiler will be used.
    "CXX_COMPILER":
    lambda: os.getenv("CXX_COMPILER", None),
    # The C compiler used for compiling the package. If not set, the default
    # value is None, which means the system default C compiler will be used.
    "C_COMPILER":
    lambda: os.getenv("C_COMPILER", None),
    # The version of the Ascend chip. If not set, the default value is
    # ASCEND910B1 (available for A2 and A3 series). It's used for package
    # building. Please make sure that the version is correct.
    "SOC_VERSION":
    lambda: os.getenv("SOC_VERSION", "ASCEND910B1"),
    # If set, vllm-ascend will print verbose logs during compilation.
    "VERBOSE":
    lambda: bool(int(os.getenv('VERBOSE', '0'))),
    # The home path for CANN toolkit. If not set, the default value is
    # /usr/local/Ascend/ascend-toolkit/latest
    "ASCEND_HOME_PATH":
    lambda: os.getenv("ASCEND_HOME_PATH", None),
    # The path for the HCCL library; it's used by the pyhccl communicator
    # backend. If not set, the default value is libhccl.so.
    "HCCL_SO_PATH":
    lambda: os.getenv("HCCL_SO_PATH", None),
    # The version of vllm that is installed. This value is used by developers
    # who installed vllm from source locally. In this case, the version of
    # vllm is usually changed. For example, if the version of vllm is "0.9.0",
    # but when it's installed from source, the version of vllm is usually set
    # to "0.9.1". In this case, developers need to set this value to "0.9.0"
    # to make sure that the correct package is installed.
    "VLLM_VERSION":
    lambda: os.getenv("VLLM_VERSION", None),
    # Whether to enable the trace of recompiles from pytorch.
    "VLLM_ASCEND_TRACE_RECOMPILES":
    lambda: bool(int(os.getenv("VLLM_ASCEND_TRACE_RECOMPILES", '0'))),
    # Whether to enable fused_experts_allgather_ep. MoeInitRoutingV3 and
    # GroupedMatmulFinalizeRouting operators are combined to implement EP.
    "VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP":
    lambda: bool(int(os.getenv("VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP", '0'))
                 ),
    # Whether to enable the DBO feature for the deepseek model.
    "VLLM_ASCEND_ENABLE_DBO":
    lambda: bool(int(os.getenv("VLLM_ASCEND_ENABLE_DBO", '0'))),
    # Whether to enable the model execute time observe profile. Disable it
    # when running vllm ascend in a production environment.
    "VLLM_ASCEND_MODEL_EXECUTE_TIME_OBSERVE":
    lambda: bool(int(os.getenv("VLLM_ASCEND_MODEL_EXECUTE_TIME_OBSERVE", '0'))
                 ),
    # MOE_ALL2ALL_BUFFER:
    # 0: default, normal init.
    # 1: enable moe_all2all_buffer.
    "MOE_ALL2ALL_BUFFER":
    lambda: bool(int(os.getenv("MOE_ALL2ALL_BUFFER", '0'))),
    # Some models are optimized by vllm ascend. While in some cases, e.g. rlhf
    # training, the optimized model may not be suitable. In this case, set
    # this value to False to disable the optimized model.
    "USE_OPTIMIZED_MODEL":
    lambda: bool(int(os.getenv('USE_OPTIMIZED_MODEL', '1'))),
    # The tolerance of the kv cache size (in megabytes): if the difference
    # between the actual kv cache size and the cached kv cache size is less
    # than this value, then the cached kv cache size will be used.
    "VLLM_ASCEND_KV_CACHE_MEGABYTES_FLOATING_TOLERANCE":
    lambda: int(
        os.getenv("VLLM_ASCEND_KV_CACHE_MEGABYTES_FLOATING_TOLERANCE", "64")),
    # Whether to enable the topk optimization. It's enabled by default. Please
    # set to False if you hit any issue.
    # We'll remove this flag in the future once it's stable enough.
    "VLLM_ASCEND_ENABLE_TOPK_TOPP_OPTIMIZATION":
    lambda: bool(
        int(os.getenv("VLLM_ASCEND_ENABLE_TOPK_TOPP_OPTIMIZATION", '1'))),
    # `LLMDataDistCMgrConnector` required variable.
    # `DISAGGREGATED_PREFILL_RANK_TABLE_PATH` is used for llmdatadist to build
    # the communication topology for kv cache transfer; it is a required
    # variable if `LLMDataDistCMgrConnector` is used as kv connector for
    # disaggregated pd. The rank table can be generated by adopting the script
    # `gen_ranktable.sh` in vllm_ascend's example folder.
    "DISAGGREGATED_PREFILL_RANK_TABLE_PATH":
    lambda: os.getenv("DISAGGREGATED_PREFILL_RANK_TABLE_PATH", None),
    # `LLMDataDistCMgrConnector` required variable. `VLLM_ASCEND_LLMDD_RPC_IP`
    # is used as the rpc communication listening ip, which will be used to
    # receive the agent metadata from the remote worker.
    "VLLM_ASCEND_LLMDD_RPC_IP":
    lambda: os.getenv("VLLM_ASCEND_LLMDD_RPC_IP", "0.0.0.0"),
    # `LLMDataDistCMgrConnector` required variable.
    # `VLLM_ASCEND_LLMDD_RPC_PORT` is used as the rpc communication listening
    # port, which will be used to receive the agent metadata from the remote
    # worker.
    "VLLM_ASCEND_LLMDD_RPC_PORT":
    lambda: int(os.getenv("VLLM_ASCEND_LLMDD_RPC_PORT", "5557")),
    # Whether to enable mla_pa for deepseek mla decode. This flag will be
    # removed once its required torch_npu is publicly accessible and mla_pa
    # becomes the default path of deepseek decode.
    "VLLM_ASCEND_MLA_PA":
    lambda: int(os.getenv("VLLM_ASCEND_MLA_PA", "0")),
    # Whether to enable the MatmulAllReduce fusion kernel when tensor parallel
    # is enabled. This feature is supported in A2, and eager mode will get
    # better performance.
    "VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE":
    lambda: bool(int(os.getenv("VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE", '0'))),
    # Whether to enable the alltoall_seq flag; this provides a basic framework
    # on the basis of alltoall for easy expansion.
    # 0: default, normal init.
    # 1: enable moe all2all seq.
    "VLLM_ASCEND_ENABLE_MOE_ALL2ALL_SEQ":
    lambda: bool(int(os.getenv('VLLM_ASCEND_ENABLE_MOE_ALL2ALL_SEQ', '0'))),
    # Whether to enable the mlp optimization when tensor parallel is enabled.
    # This feature in eager mode will get better performance.
    "VLLM_ASCEND_ENABLE_MLP_OPTIMIZE":
    lambda: bool(int(os.getenv("VLLM_ASCEND_ENABLE_MLP_OPTIMIZE", '0'))),
}
|
|
|
|
|
|
2025-04-15 16:09:36 +08:00
|
|
|
# end-env-vars-definition
|
|
|
|
|
|
2025-04-03 14:52:34 +08:00
|
|
|
|
|
|
|
|
def __getattr__(name: str):
    # Module-level attribute hook (PEP 562): environment variables are
    # evaluated lazily, so each attribute access re-reads the process
    # environment instead of caching a value at import time.
    loader = env_variables.get(name)
    if loader is not None:
        return loader()
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def __dir__():
    # Advertise the lazily-evaluated env var names for dir()/tab-completion.
    return [name for name in env_variables]
|