#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# This file is mainly Adapted from vllm-project/vllm/vllm/envs.py
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
|
2025-04-03 14:52:34 +08:00
|
|
|
import os
|
2026-01-19 08:59:46 +08:00
|
|
|
from collections.abc import Callable
|
|
|
|
|
from typing import Any
|
2025-04-03 14:52:34 +08:00
|
|
|
|
# The begin-* and end-* markers here are used by the documentation generator
# to extract the used env vars.

# begin-env-vars-definition
# Mapping from environment-variable name to a zero-argument callable that
# reads and converts the variable at call time (lazy evaluation — the
# environment is consulted on every access, not at import time).
env_variables: dict[str, Callable[[], Any]] = {
    # Max compile thread number for package building. Usually, it is set to
    # the number of CPU cores. If not set, the default value is None, which
    # means all of the CPU cores will be used.
    "MAX_JOBS": lambda: os.getenv("MAX_JOBS", None),
    # The build type of the package. It can be one of the following values:
    # Release, Debug, RelWithDebugInfo. If not set, the default value is Release.
    "CMAKE_BUILD_TYPE": lambda: os.getenv("CMAKE_BUILD_TYPE"),
    # Whether to compile custom kernels. If not set, the default value is True.
    # If set to False, the custom kernels will not be compiled.
    # This configuration option should only be set to False when running UT
    # scenarios in an environment without an NPU. Do not set it to False in
    # other scenarios.
    "COMPILE_CUSTOM_KERNELS": lambda: bool(int(os.getenv("COMPILE_CUSTOM_KERNELS", "1"))),
    # The CXX compiler used for compiling the package. If not set, the default
    # value is None, which means the system default CXX compiler will be used.
    "CXX_COMPILER": lambda: os.getenv("CXX_COMPILER", None),
    # The C compiler used for compiling the package. If not set, the default
    # value is None, which means the system default C compiler will be used.
    "C_COMPILER": lambda: os.getenv("C_COMPILER", None),
    # The version of the Ascend chip. It's used for package building.
    # If not set, we will query chip info through `npu-smi`.
    # Please make sure that the version is correct.
    "SOC_VERSION": lambda: os.getenv("SOC_VERSION", None),
    # If set, vllm-ascend will print verbose logs during compilation.
    "VERBOSE": lambda: bool(int(os.getenv("VERBOSE", "0"))),
    # The home path for the CANN toolkit. If not set, the default value is
    # /usr/local/Ascend/ascend-toolkit/latest.
    "ASCEND_HOME_PATH": lambda: os.getenv("ASCEND_HOME_PATH", None),
    # The path for the HCCL library, it's used by the pyhccl communicator
    # backend. If not set, the default value is libhccl.so.
    "HCCL_SO_PATH": lambda: os.getenv("HCCL_SO_PATH", None),
    # The version of vllm that is installed. This value is used by developers
    # who installed vllm from source locally. In this case, the version of vllm
    # is usually changed. For example, if the version of vllm is "0.9.0", but
    # when it's installed from source, the version of vllm is usually set to
    # "0.9.1". In this case, developers need to set this value to "0.9.0" to
    # make sure that the correct package is installed.
    "VLLM_VERSION": lambda: os.getenv("VLLM_VERSION", None),
    # Whether to enable the MatmulAllReduce fusion kernel when tensor parallel
    # is enabled. This feature is supported in A2, and eager mode will get
    # better performance.
    "VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE": lambda: bool(int(os.getenv("VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE", "0"))),
    # Whether to enable FlashComm optimization when tensor parallel is enabled.
    # This feature will get better performance when concurrency is large.
    "VLLM_ASCEND_ENABLE_FLASHCOMM1": lambda: bool(int(os.getenv("VLLM_ASCEND_ENABLE_FLASHCOMM1", "0"))),
    # Whether to enable FLASHCOMM2. Setting it to 0 disables the feature, while
    # setting it to 1 or above enables it. The specific value set will be used
    # as the O-matrix TP group size for flashcomm2.
    # For a detailed introduction to the parameters and the differences and
    # applicable scenarios between this feature and FLASHCOMM1, please refer to
    # the feature guide in the documentation.
    "VLLM_ASCEND_FLASHCOMM2_PARALLEL_SIZE": lambda: int(os.getenv("VLLM_ASCEND_FLASHCOMM2_PARALLEL_SIZE", "0")),
    # Whether to enable the msMonitor tool to monitor the performance of
    # vllm-ascend.
    "MSMONITOR_USE_DAEMON": lambda: bool(int(os.getenv("MSMONITOR_USE_DAEMON", "0"))),
    # Whether to enable MLAPO optimization for DeepSeek W8A8 series models.
    # This option is enabled by default. MLAPO can improve performance, but
    # it will consume more NPU memory. If reducing NPU memory usage is a higher
    # priority for your DeepSeek W8A8 scene, then disable it.
    "VLLM_ASCEND_ENABLE_MLAPO": lambda: bool(int(os.getenv("VLLM_ASCEND_ENABLE_MLAPO", "1"))),
    # Whether to enable weight cast format to FRACTAL_NZ.
    # 0: close nz;
    # 1: only quant case enable nz;
    # 2: enable nz as long as possible.
    "VLLM_ASCEND_ENABLE_NZ": lambda: int(os.getenv("VLLM_ASCEND_ENABLE_NZ", "1")),
    # Decide whether we should enable CP parallelism.
    "VLLM_ASCEND_ENABLE_CONTEXT_PARALLEL": lambda: bool(int(os.getenv("VLLM_ASCEND_ENABLE_CONTEXT_PARALLEL", "0"))),
    # Whether to enable dynamic EPLB. NOTE: unlike the boolean flags above,
    # this returns the lower-cased string value (e.g. "true"/"false").
    "DYNAMIC_EPLB": lambda: os.getenv("DYNAMIC_EPLB", "false").lower(),
    # Whether to enable fused mc2 (`dispatch_gmm_combine_decode`/
    # `dispatch_ffn_combine` operator).
    # 0, or not set: default ALLTOALL and MC2 will be used.
    # 1: ALLTOALL and MC2 might be replaced by the `dispatch_ffn_combine`
    #    operator. `dispatch_ffn_combine` can be used only for moe layer with
    #    W8A8, EP<=32, non-mtp, non-dynamic-eplb.
    # 2: MC2 might be replaced by the `dispatch_gmm_combine_decode` operator.
    #    `dispatch_gmm_combine_decode` can be used only for **decode node** moe
    #    layer with W8A8. And MTP layer must be W8A8.
    "VLLM_ASCEND_ENABLE_FUSED_MC2": lambda: int(os.getenv("VLLM_ASCEND_ENABLE_FUSED_MC2", "0")),
    # Whether to enable balance scheduling.
    "VLLM_ASCEND_BALANCE_SCHEDULING": lambda: bool(int(os.getenv("VLLM_ASCEND_BALANCE_SCHEDULING", "0"))),
    # Use the fused op transpose_kv_cache_by_block, default is True.
    "VLLM_ASCEND_FUSION_OP_TRANSPOSE_KV_CACHE_BY_BLOCK": lambda: bool(
        int(os.getenv("VLLM_ASCEND_FUSION_OP_TRANSPOSE_KV_CACHE_BY_BLOCK", "1"))
    ),
    # Whether to enable vNPU support. If not set, the default value is True.
    "VLLM_ASCEND_ENABLE_VNPU": lambda: bool(int(os.getenv("VLLM_ASCEND_ENABLE_VNPU", "1"))),
}
# end-env-vars-definition

def __getattr__(name: str):
    """Resolve module attributes lazily: look the name up in ``env_variables``
    and evaluate its getter on every access, so the environment is re-read
    each time the attribute is used."""
    getter = env_variables.get(name)
    if getter is not None:
        return getter()
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def __dir__():
    """Advertise the lazily-evaluated environment-variable names via dir()."""
    return [*env_variables]
|