Files
xc-llm-ascend/vllm_ascend/ascend_config.py

350 lines
15 KiB
Python
Raw Normal View History

#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
[EPLB]Eplb Config Renaming (#5533) ### What this PR does / why we need it? 1. Rename num_iterations_eplb_update to expert_heat_collection_interval. 2. Rename num_wait_worker_iterations to algorithm_execution_interval. 3. Rename init_redundancy_expert to num_redundant_experts because the variable with the same meaning in vLLM is named this way. 4. Delete gate_eplb because we don't need this feature. 5. Move eplb config into a dict in additional config. 6. Depend on pr5817 ### Does this PR introduce _any_ user-facing change? before this pr: `--additional-config '{"dynamic_eplb":true, "num_iterations_eplb_update": 4000, "num_wait_worker_iterations": 150, "init_redundancy_expert": 16, "expert_map_path": "xxx.json"}'` after this pr: `--additional-config '{"eplb_config":{"dynamic_eplb":true,"expert_heat_collection_interval":4000, "algorithm_execution_interval":150,"num_redundant_experts": 16, "expert_map_path": "xxx.json"}}'` ### How was this patch tested? #### test qwen3-235b eplb num_redundant_experts=16 without pr5817 | dataset | version | metric | mode | vllm-api-general-chat | |----- | ----- | ----- | ----- | -----| | aime2024 | 604a78 | accuracy | gen | 83.33 | with pr5817 | dataset | version | metric | mode | vllm-api-general-chat | |----- | ----- | ----- | ----- | -----| | aime2024 | 604a78 | accuracy | gen | 86.67 | - vLLM version: v0.13.0 - vLLM main: https://github.com/vllm-project/vllm/commit/45c1ca1ca1ee8fa06df263c8715e8a412ff408d4 Signed-off-by: shenchuxiaofugui <1311027364@qq.com>
2026-01-15 10:26:44 +08:00
import os
from typing import TYPE_CHECKING, Optional
from vllm.logger import logger
from vllm.triton_utils import HAS_TRITON
if TYPE_CHECKING:
from vllm.config import VllmConfig
class AscendConfig:
"""
Configuration Object for additional_config from vllm.configs.
"""
def __init__(self, vllm_config: "VllmConfig"):
additional_config = vllm_config.additional_config if vllm_config.additional_config is not None else {}
xlite_graph_config = additional_config.get("xlite_graph_config", {})
self.xlite_graph_config = XliteGraphConfig(xlite_graph_config,
vllm_config)
Adopt inductor fusion and define quantization fusion pass (#4168) ### What this PR does / why we need it? The main goal of this PR to alleviate the high maintenance burden from model duplication when we are going to do the model optimization. Some of our optimized models diverges a little from the vllm's modeling, but needs to rewrite several part of original one, brings negligible maintenance bruden to the vllm-ascend.In order to solve that, we propose to leverage `torch.compile` and `inductor pattern matcher`, automatically fuse the pattern we want to merge. For more details can refer to the RFC https://github.com/vllm-project/vllm-ascend/issues/4239 This pr integrates `AddRMSNorm` and the `Quant` operator, which can improve the inference speed of models using `w8a8 `quantization. ### Does this PR introduce _any_ user-facing change? Yes, add new additional_config ### How was this patch tested? ```python def main(): prompts = [ "The president of the United States is Mr.", ] # Create a sampling params object. sampling_params = SamplingParams(max_tokens=100, temperature=0.6, top_k=40, top_p=0.95) # Create an LLM. llm = LLM( model="/root/.cache/modelscope/hub/models/vllm-ascend/Qwen3-8B-W8A8", # enforce_eager=True, tensor_parallel_size=1, trust_remote_code=True, gpu_memory_utilization=0.7, quantization="ascend", ) # Generate texts from the prompts. outputs = llm.generate(prompts, sampling_params) for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` ```text Prompt: 'The president of the United States is Mr.', Generated text: ' Trump. The president of the United States is Mr. Biden. Which of the following statements is correct? \n\nA. Mr. Trump is Mr. Biden. \nB. Mr. Trump is not Mr. Biden. \nC. The president of the United States is not Mr. Trump. \nD. The president of the United States is not Mr. Biden.\n\nThe question presents a contradiction: it states that "The president of the United States is Mr. Trump" and "The president of' ``` - vLLM version: 86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24 - vLLM main: https://github.com/vllm-project/vllm/commit/86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24 --------- Signed-off-by: Icey <1790571317@qq.com> Signed-off-by: wxsIcey <1790571317@qq.com>
2025-12-04 10:29:48 +08:00
ascend_compilation_config = additional_config.get(
"ascend_compilation_config", {})
self.ascend_compilation_config = AscendCompilationConfig(
**ascend_compilation_config)
finegrained_tp_config = additional_config.get("finegrained_tp_config",
{})
self.finegrained_tp_config = FinegrainedTPConfig(
finegrained_tp_config, vllm_config)
[EPLB]Eplb Config Renaming (#5533) ### What this PR does / why we need it? 1. Rename num_iterations_eplb_update to expert_heat_collection_interval. 2. Rename num_wait_worker_iterations to algorithm_execution_interval. 3. Rename init_redundancy_expert to num_redundant_experts because the variable with the same meaning in vLLM is named this way. 4. Delete gate_eplb because we don't need this feature. 5. Move eplb config into a dict in additional config. 6. Depend on pr5817 ### Does this PR introduce _any_ user-facing change? before this pr: `--additional-config '{"dynamic_eplb":true, "num_iterations_eplb_update": 4000, "num_wait_worker_iterations": 150, "init_redundancy_expert": 16, "expert_map_path": "xxx.json"}'` after this pr: `--additional-config '{"eplb_config":{"dynamic_eplb":true,"expert_heat_collection_interval":4000, "algorithm_execution_interval":150,"num_redundant_experts": 16, "expert_map_path": "xxx.json"}}'` ### How was this patch tested? #### test qwen3-235b eplb num_redundant_experts=16 without pr5817 | dataset | version | metric | mode | vllm-api-general-chat | |----- | ----- | ----- | ----- | -----| | aime2024 | 604a78 | accuracy | gen | 83.33 | with pr5817 | dataset | version | metric | mode | vllm-api-general-chat | |----- | ----- | ----- | ----- | -----| | aime2024 | 604a78 | accuracy | gen | 86.67 | - vLLM version: v0.13.0 - vLLM main: https://github.com/vllm-project/vllm/commit/45c1ca1ca1ee8fa06df263c8715e8a412ff408d4 Signed-off-by: shenchuxiaofugui <1311027364@qq.com>
2026-01-15 10:26:44 +08:00
eplb_config = additional_config.get("eplb_config", {})
self.eplb_config = EplbConfig(eplb_config)
# Dump / PrecisionDebugger configuration
self.dump_config_path = additional_config.get("dump_config_path", None)
weight_prefetch_config = additional_config.get(
"weight_prefetch_config", {})
self.weight_prefetch_config = WeightPrefetchConfig(
weight_prefetch_config)
self.layer_sharding = additional_config.get("layer_sharding", None)
logger.info_once(
f"Linear layer sharding enabled with config: {self.layer_sharding}. "
"Note: This feature works optimally with FLASHCOMM2 and DSA-CP enabled; "
"using it without these features may result in significant performance degradation."
)
[main][prefill optimization] Optimize parallel strategies to reduce communication overhead (#2198) ### What this PR does / why we need it? 1.Shared Expert Sharding Strategy Update: Switched from TP-aligned to pure DP for shared experts, enabling more efficient execution. 2.O_Proj AllReduce → ReduceScatter: Reduced communication overhead by using ReduceScatter, made possible by pure DP sharding. 3.AllGather Postponed: Delayed to after QKV down projection to reduce synchronization impact during prefill. ### How was this patch tested? Adding ut case in `tests/ut/attention/test_mla_v1.py` #### How to run use parameter `--additional_config='{"enable_shared_expert_dp": true}'` ##### a.How to run eager mode eg: python -m vllm.entrypoints.openai.api_server --model=/model_path --trust-remote-code -tp 8 -dp 2 --enable_expert_parallel --port 8002 --max-model-len 5120 --max-num-batched-tokens 16384 --enforce-eager --disable-log-requests --additional_config='{"ascend_scheduler_config":{"enabled":true},"enable_shared_expert_dp": true,"chunked_prefill_for_mla":true}' ##### b.How to run graph mode eg: python -m vllm.entrypoints.openai.api_server --model=/model_path --trust-remote-code -tp 8 -dp 2 --enable_expert_parallel --port 8002 --max-model-len 5120 --max-num-batched-tokens 16384 --disable-log-requests --additional_config='{"ascend_scheduler_config":{"enabled":true},"enable_shared_expert_dp": true,"chunked_prefill_for_mla":true,"torchair_graph_config":{"enabled":true}}' - vLLM version: v0.10.0 - vLLM main: https://github.com/vllm-project/vllm/commit/9edd1db02bc6dce6da503503a373657f3466a78b --------- Signed-off-by: Wang Kunpeng <1289706727@qq.com> Signed-off-by: SlightwindSec <slightwindsec@gmail.com> Co-authored-by: SlightwindSec <slightwindsec@gmail.com>
2025-08-12 14:12:12 +08:00
self.enable_shared_expert_dp = additional_config.get(
"enable_shared_expert_dp",
False) and vllm_config.parallel_config.enable_expert_parallel
if self.enable_shared_expert_dp:
from vllm_ascend.utils import enable_sp
assert enable_sp(vllm_config=vllm_config,
enable_shared_expert_dp=True)
self.multistream_overlap_shared_expert = additional_config.get(
"multistream_overlap_shared_expert", False)
self.multistream_overlap_gate = additional_config.get(
"multistream_overlap_gate", False)
self.recompute_scheduler_enable = additional_config.get(
"recompute_scheduler_enable", False)
self.enable_cpu_binding = additional_config.get(
"enable_cpu_binding", False)
self.pd_tp_ratio = 1
self.pd_head_ratio = 1
self.num_head_replica = 1
if vllm_config.kv_transfer_config is not None and not vllm_config.model_config.is_deepseek_mla:
prefill_tp_size = vllm_config.kv_transfer_config.get_from_extra_config(
"prefill", {"tp_size": 1})["tp_size"]
decode_tp_size = vllm_config.kv_transfer_config.get_from_extra_config(
"decode", {"tp_size": 1})["tp_size"]
assert prefill_tp_size % decode_tp_size == 0, "Prefill TP size must be divisible by Decode TP size."
self.pd_tp_ratio = prefill_tp_size // decode_tp_size
if self.pd_tp_ratio > 1:
try:
# only support Qwen model now
# TODO: use a more robust method to get kv_head_num
num_kv_head = vllm_config.model_config.hf_text_config.num_key_value_heads
self.num_head_replica = prefill_tp_size // num_kv_head if prefill_tp_size >= num_kv_head else 1
prefill_tp_size = min(prefill_tp_size, num_kv_head)
decode_tp_size = min(decode_tp_size, num_kv_head)
self.pd_head_ratio = prefill_tp_size // decode_tp_size
except Exception:
[CI]Add Disaggregated PD Nightly Test for Qwen3-235B and Qwen3-VL-235B (#5502) ### What this PR does / why we need it? This PR adds online **Disaggregated Prefill/Decode** performance and accuracy tests for the **Qwen3-235B-A22B** and **Qwen3-VL-235B-A22B-Instruct** models to the Nightly test suite. These test configurations simulate the deployment of massive MoE and Vision-Language models in **a dual-node (32 NPU)** environment, utilizing Mooncake (KVCache Transfer) technology to achieve efficient KV cache transfer between the Prefill node and the Decode node. #### Test Configuration **Qwen3-235B-A22B** - Model: Qwen/Qwen3-235B-A22B - Hardware: A3, 2 Nodes (32 NPUs total, 16 NPUs per node) - Architecture: Disaggregated Prefill & Decode - Node 0 (Producer/Prefill): **DP2 + TP8 + EP + FLASHCOMM1 + FUSED_MC2**. - Node 1 (Consumer/Decode): **DP4 + TP4 + EP + FLASHCOMM1 + FUSED_MC2 + FULL_DECODE_ONLY**. - Benchmarks: - Performance: vllm-ascend/GSM8K-in3500-bs2800. - Accuracy: vllm-ascend/gsm8k-lite. **Qwen3-VL-235B-A22B-Instruct** - Model: Qwen/Qwen3-VL-235B-A22B-Instruct - Hardware: A3, 2 Nodes (32 NPUs total, 16 NPUs per node) - Architecture: Disaggregated Prefill & Decode - Node 0 (Producer/Prefill): **DP2 + TP8 + EP**. - Node 1 (Consumer/Decode): **DP4 + TP4 + EP + FULL_DECODE_ONLY**. - Benchmarks: - Performance: vllm-ascend/textvqa-perf-1080p. - Accuracy: vllm-ascend/textvqa-lite. ### How was this patch tested? Nightly test action on CI - vLLM version: v0.13.0 - vLLM main: https://github.com/vllm-project/vllm/commit/45c1ca1ca1ee8fa06df263c8715e8a412ff408d4 --------- Signed-off-by: MrZ20 <2609716663@qq.com>
2026-01-09 16:25:20 +08:00
raise ValueError(
"The text_config extracted from the model config does not have "
"`num_key_value_heads` attribute. This indicates a mismatch "
"between the model config and vLLM's expectations. Please "
"ensure that the model config is compatible with vLLM."
)
if self.pd_tp_ratio == 0:
raise AssertionError(
"Only support P node tp size lagger then D node tp size")
self.SLO_limits_for_dynamic_batch = additional_config.get(
"SLO_limits_for_dynamic_batch", -1)
[Feat] Flashcomm2 use o_shared linear (#4188) ### What this PR does / why we need it? It is mentioned in the [flashcomm2 technical report](https://gitcode.com/ascend-tribe/ascend-inference-cluster/blob/main/FlashComm/FlashComm2%E5%A4%A7%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86%E4%B8%AD%E4%BB%A5%E5%AD%98%E6%8D%A2%E4%BC%A0%E7%9A%84%E9%80%9A%E4%BF%A1%E4%BC%98%E5%8C%96%E6%8A%80%E6%9C%AF.pdf) that FC2 will introduce full redundant storage of the o_proj matrix, which will put pressure on the memory. Therefore, the technical report proposed a compromise solution using otp2, but it will introduce additional reduce-scatter communication. We propose a shared linear feature (#2931 ) that supports distributing weights layer by layer to each card, avoiding the need for TP splitting, and can solve the memory issue. This PR depends on #3232 and #2931 ### Flashcomm2 flowchart <img width="1142" height="878" alt="PixPin_2025-11-14_13-37-39" src="https://github.com/user-attachments/assets/d45ea8db-d8ef-4d45-8e18-abd4d82ce3e0" /> ### Does this PR introduce _any_ user-facing change? Use environment variables ```bash export VLLM_ASCEND_FLASHCOMM2_PARALLEL_SIZE=1 export VLLM_ASCEND_ENABLE_FLASHCOMM2_OSHARED=1 ``` - vLLM version: v0.12.0 - vLLM main: https://github.com/vllm-project/vllm/commit/ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9 --------- Signed-off-by: zzhx1 <zzh_201018@outlook.com> Signed-off-by: zzhxx <2783294813@qq.com> Co-authored-by: zzh02232027 <zzh02232027@antgroup.com> Co-authored-by: clrs97 <524936896@qq.com> Co-authored-by: Levi-JQ <yujinqi2@huawei.com>
2025-12-11 12:43:04 +08:00
from vllm_ascend.utils import get_flashcomm2_config_and_validate
self.flashcomm2_oproj_tensor_parallel_size = get_flashcomm2_config_and_validate(
self, vllm_config)
[Feature] Support npuhraph_ex backend (#4700) ### What this PR does / why we need it? We introduced the npugraph_ex backend through the vllm's adaptor dispatch mechanism to accelerate aclgraph. This solution is based on torch.compile and uses torchair to optimize the fx.graph. The performance gains are mainly obtained from the static kernel. We conducted tests on Qwen3-30B and achieved over 5% performance optimization. ### Does this PR introduce _any_ user-facing change? Yes, we add a new switch named"enable_npugraph_ex" in additional_config, default is False. We also add an example to show how to register custom replacement pass ### More information about this PR This feature depends on the release of CANN and torch_npu in Q4. We tested it on a package that has not been publicly released yet and verified that the functionality works. This feature is still experimental at the moment; setting the config true will directly raise error. Merging into the main branch initially involves some preliminary commits to facilitate subsequent development and testing of the feature, as well as to avoid submitting an excessively large PR at once. - vLLM version: v0.12.0 - vLLM main: https://github.com/vllm-project/vllm/commit/ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9 --------- Signed-off-by: chencangtao <chencangtao@huawei.com> Signed-off-by: ChenCangtao <50493711+ChenCangtao@users.noreply.github.com> Co-authored-by: chencangtao <chencangtao@huawei.com> Co-authored-by: panchao-hub <315134829@qq.com> Co-authored-by: wbigat <wbigat@163.com> Co-authored-by: Mengqing Cao <cmq0113@163.com>
2025-12-10 20:48:05 +08:00
self.enable_npugraph_ex = additional_config.get(
"enable_npugraph_ex", False)
# We find that _npu_paged_attention still performs better than
# npu_fused_infer_attention_score in some cases. We allow to execute
# _npu_paged_attention in this cases. This should be removed once
# npu_fused_infer_attention_score performs better on all scenarios.
self.pa_shape_list = additional_config.get("pa_shape_list", [])
self.enable_async_exponential = bool(
additional_config.get("enable_async_exponential", False))
self.enable_kv_nz = additional_config.get("enable_kv_nz", False)
if self.enable_kv_nz:
use_sparse = hasattr(vllm_config.model_config.hf_text_config,
"index_topk")
if not vllm_config.model_config.is_deepseek_mla or use_sparse:
raise RuntimeError(
"enable_kv_nz is only supported for mla currently.")
if vllm_config.kv_transfer_config is None \
or not vllm_config.kv_transfer_config.is_kv_consumer:
raise NotImplementedError(
"enable_kv_nz is only supported in pd scenario and can "
"only be used in D node.")
class FinegrainedTPConfig:
"""
Configuration Object for finegrained_tp_config from additional_config
"""
def __init__(self, finegrained_tp_config: dict, vllm_config):
self.oproj_tensor_parallel_size = finegrained_tp_config.get(
"oproj_tensor_parallel_size", 0)
self.lmhead_tensor_parallel_size = finegrained_tp_config.get(
"lmhead_tensor_parallel_size", 0)
self.embedding_tensor_parallel_size = finegrained_tp_config.get(
"embedding_tensor_parallel_size", 0)
self.mlp_tensor_parallel_size = finegrained_tp_config.get(
"mlp_tensor_parallel_size", 0)
enabled_configs = []
if self.oproj_tensor_parallel_size > 0:
enabled_configs.append(
f"oproj_tensor_parallel_size={self.oproj_tensor_parallel_size}"
)
# dummy_run does not run the entire attention module in eager mode,, so the o_proj tp split can only be used in graph mode.
if vllm_config.model_config.enforce_eager is True:
raise AssertionError(
"oproj_tensor_parallel_size is only supported in graph mode"
)
if vllm_config.kv_transfer_config is None or not vllm_config.kv_transfer_config.is_kv_consumer:
raise AssertionError(
"oproj_tensor_parallel_size is only supported in pd scenario and can only be used in D node."
)
if self.lmhead_tensor_parallel_size > 0:
enabled_configs.append(
f"lmhead_tensor_parallel_size={self.lmhead_tensor_parallel_size}"
)
if self.embedding_tensor_parallel_size > 0:
enabled_configs.append(
f"embedding_tensor_parallel_size={self.embedding_tensor_parallel_size}"
)
if self.mlp_tensor_parallel_size > 0:
enabled_configs.append(
f"mlp_tensor_parallel_size={self.mlp_tensor_parallel_size}")
module_tp_sizes = [
self.oproj_tensor_parallel_size,
self.lmhead_tensor_parallel_size,
self.embedding_tensor_parallel_size,
self.mlp_tensor_parallel_size,
]
for module_tp_size in module_tp_sizes:
if module_tp_size > 0 and vllm_config.parallel_config.data_parallel_size % module_tp_size != 0:
raise AssertionError(
"module tp sizes must divide data_parallel_size")
if any(size > 0 for size in module_tp_sizes) and enabled_configs:
logger.info(
f"finegrained_tp_config enabled: {', '.join(enabled_configs)}")
Adopt inductor fusion and define quantization fusion pass (#4168) ### What this PR does / why we need it? The main goal of this PR to alleviate the high maintenance burden from model duplication when we are going to do the model optimization. Some of our optimized models diverges a little from the vllm's modeling, but needs to rewrite several part of original one, brings negligible maintenance bruden to the vllm-ascend.In order to solve that, we propose to leverage `torch.compile` and `inductor pattern matcher`, automatically fuse the pattern we want to merge. For more details can refer to the RFC https://github.com/vllm-project/vllm-ascend/issues/4239 This pr integrates `AddRMSNorm` and the `Quant` operator, which can improve the inference speed of models using `w8a8 `quantization. ### Does this PR introduce _any_ user-facing change? Yes, add new additional_config ### How was this patch tested? ```python def main(): prompts = [ "The president of the United States is Mr.", ] # Create a sampling params object. sampling_params = SamplingParams(max_tokens=100, temperature=0.6, top_k=40, top_p=0.95) # Create an LLM. llm = LLM( model="/root/.cache/modelscope/hub/models/vllm-ascend/Qwen3-8B-W8A8", # enforce_eager=True, tensor_parallel_size=1, trust_remote_code=True, gpu_memory_utilization=0.7, quantization="ascend", ) # Generate texts from the prompts. outputs = llm.generate(prompts, sampling_params) for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` ```text Prompt: 'The president of the United States is Mr.', Generated text: ' Trump. The president of the United States is Mr. Biden. Which of the following statements is correct? \n\nA. Mr. Trump is Mr. Biden. \nB. Mr. Trump is not Mr. Biden. \nC. The president of the United States is not Mr. Trump. \nD. The president of the United States is not Mr. Biden.\n\nThe question presents a contradiction: it states that "The president of the United States is Mr. Trump" and "The president of' ``` - vLLM version: 86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24 - vLLM main: https://github.com/vllm-project/vllm/commit/86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24 --------- Signed-off-by: Icey <1790571317@qq.com> Signed-off-by: wxsIcey <1790571317@qq.com>
2025-12-04 10:29:48 +08:00
class AscendCompilationConfig:
"""
Configuration for controlling the behavior of Ascend graph optimization.
This class provides a way to configure graph fusion optimizations.
These configurations directly impact the performance and behavior of models
deployed on Ascend platforms.
"""
def __init__(self,
fuse_norm_quant: bool = True,
fuse_qknorm_rope: bool = False,
**kwargs):
Adopt inductor fusion and define quantization fusion pass (#4168) ### What this PR does / why we need it? The main goal of this PR to alleviate the high maintenance burden from model duplication when we are going to do the model optimization. Some of our optimized models diverges a little from the vllm's modeling, but needs to rewrite several part of original one, brings negligible maintenance bruden to the vllm-ascend.In order to solve that, we propose to leverage `torch.compile` and `inductor pattern matcher`, automatically fuse the pattern we want to merge. For more details can refer to the RFC https://github.com/vllm-project/vllm-ascend/issues/4239 This pr integrates `AddRMSNorm` and the `Quant` operator, which can improve the inference speed of models using `w8a8 `quantization. ### Does this PR introduce _any_ user-facing change? Yes, add new additional_config ### How was this patch tested? ```python def main(): prompts = [ "The president of the United States is Mr.", ] # Create a sampling params object. sampling_params = SamplingParams(max_tokens=100, temperature=0.6, top_k=40, top_p=0.95) # Create an LLM. llm = LLM( model="/root/.cache/modelscope/hub/models/vllm-ascend/Qwen3-8B-W8A8", # enforce_eager=True, tensor_parallel_size=1, trust_remote_code=True, gpu_memory_utilization=0.7, quantization="ascend", ) # Generate texts from the prompts. outputs = llm.generate(prompts, sampling_params) for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` ```text Prompt: 'The president of the United States is Mr.', Generated text: ' Trump. The president of the United States is Mr. Biden. Which of the following statements is correct? \n\nA. Mr. Trump is Mr. Biden. \nB. Mr. Trump is not Mr. Biden. \nC. The president of the United States is not Mr. Trump. \nD. The president of the United States is not Mr. Biden.\n\nThe question presents a contradiction: it states that "The president of the United States is Mr. Trump" and "The president of' ``` - vLLM version: 86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24 - vLLM main: https://github.com/vllm-project/vllm/commit/86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24 --------- Signed-off-by: Icey <1790571317@qq.com> Signed-off-by: wxsIcey <1790571317@qq.com>
2025-12-04 10:29:48 +08:00
"""
Initialize the configuration.
Args:
fuse_norm_quant (bool): Whether to enable norm and quant fusion optimization.
When set to True, the system will optimize norm and quant operations.
Adopt inductor fusion and define quantization fusion pass (#4168) ### What this PR does / why we need it? The main goal of this PR to alleviate the high maintenance burden from model duplication when we are going to do the model optimization. Some of our optimized models diverges a little from the vllm's modeling, but needs to rewrite several part of original one, brings negligible maintenance bruden to the vllm-ascend.In order to solve that, we propose to leverage `torch.compile` and `inductor pattern matcher`, automatically fuse the pattern we want to merge. For more details can refer to the RFC https://github.com/vllm-project/vllm-ascend/issues/4239 This pr integrates `AddRMSNorm` and the `Quant` operator, which can improve the inference speed of models using `w8a8 `quantization. ### Does this PR introduce _any_ user-facing change? Yes, add new additional_config ### How was this patch tested? ```python def main(): prompts = [ "The president of the United States is Mr.", ] # Create a sampling params object. sampling_params = SamplingParams(max_tokens=100, temperature=0.6, top_k=40, top_p=0.95) # Create an LLM. llm = LLM( model="/root/.cache/modelscope/hub/models/vllm-ascend/Qwen3-8B-W8A8", # enforce_eager=True, tensor_parallel_size=1, trust_remote_code=True, gpu_memory_utilization=0.7, quantization="ascend", ) # Generate texts from the prompts. outputs = llm.generate(prompts, sampling_params) for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` ```text Prompt: 'The president of the United States is Mr.', Generated text: ' Trump. The president of the United States is Mr. Biden. Which of the following statements is correct? \n\nA. Mr. Trump is Mr. Biden. \nB. Mr. Trump is not Mr. Biden. \nC. The president of the United States is not Mr. Trump. \nD. The president of the United States is not Mr. Biden.\n\nThe question presents a contradiction: it states that "The president of the United States is Mr. Trump" and "The president of' ``` - vLLM version: 86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24 - vLLM main: https://github.com/vllm-project/vllm/commit/86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24 --------- Signed-off-by: Icey <1790571317@qq.com> Signed-off-by: wxsIcey <1790571317@qq.com>
2025-12-04 10:29:48 +08:00
Default: True
fuse_qknorm_rope (bool): Whether to enable qknorm and rope fusion optimization.
Default: False
Adopt inductor fusion and define quantization fusion pass (#4168) ### What this PR does / why we need it? The main goal of this PR to alleviate the high maintenance burden from model duplication when we are going to do the model optimization. Some of our optimized models diverges a little from the vllm's modeling, but needs to rewrite several part of original one, brings negligible maintenance bruden to the vllm-ascend.In order to solve that, we propose to leverage `torch.compile` and `inductor pattern matcher`, automatically fuse the pattern we want to merge. For more details can refer to the RFC https://github.com/vllm-project/vllm-ascend/issues/4239 This pr integrates `AddRMSNorm` and the `Quant` operator, which can improve the inference speed of models using `w8a8 `quantization. ### Does this PR introduce _any_ user-facing change? Yes, add new additional_config ### How was this patch tested? ```python def main(): prompts = [ "The president of the United States is Mr.", ] # Create a sampling params object. sampling_params = SamplingParams(max_tokens=100, temperature=0.6, top_k=40, top_p=0.95) # Create an LLM. llm = LLM( model="/root/.cache/modelscope/hub/models/vllm-ascend/Qwen3-8B-W8A8", # enforce_eager=True, tensor_parallel_size=1, trust_remote_code=True, gpu_memory_utilization=0.7, quantization="ascend", ) # Generate texts from the prompts. outputs = llm.generate(prompts, sampling_params) for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` ```text Prompt: 'The president of the United States is Mr.', Generated text: ' Trump. The president of the United States is Mr. Biden. Which of the following statements is correct? \n\nA. Mr. Trump is Mr. Biden. \nB. Mr. Trump is not Mr. Biden. \nC. The president of the United States is not Mr. Trump. \nD. The president of the United States is not Mr. Biden.\n\nThe question presents a contradiction: it states that "The president of the United States is Mr. Trump" and "The president of' ``` - vLLM version: 86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24 - vLLM main: https://github.com/vllm-project/vllm/commit/86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24 --------- Signed-off-by: Icey <1790571317@qq.com> Signed-off-by: wxsIcey <1790571317@qq.com>
2025-12-04 10:29:48 +08:00
**kwargs: Additional optional parameters for forward compatibility and configuration extension.
"""
self.fuse_norm_quant = fuse_norm_quant
self.fuse_qknorm_rope = HAS_TRITON or fuse_qknorm_rope
Adopt inductor fusion and define quantization fusion pass (#4168) ### What this PR does / why we need it? The main goal of this PR to alleviate the high maintenance burden from model duplication when we are going to do the model optimization. Some of our optimized models diverges a little from the vllm's modeling, but needs to rewrite several part of original one, brings negligible maintenance bruden to the vllm-ascend.In order to solve that, we propose to leverage `torch.compile` and `inductor pattern matcher`, automatically fuse the pattern we want to merge. For more details can refer to the RFC https://github.com/vllm-project/vllm-ascend/issues/4239 This pr integrates `AddRMSNorm` and the `Quant` operator, which can improve the inference speed of models using `w8a8 `quantization. ### Does this PR introduce _any_ user-facing change? Yes, add new additional_config ### How was this patch tested? ```python def main(): prompts = [ "The president of the United States is Mr.", ] # Create a sampling params object. sampling_params = SamplingParams(max_tokens=100, temperature=0.6, top_k=40, top_p=0.95) # Create an LLM. llm = LLM( model="/root/.cache/modelscope/hub/models/vllm-ascend/Qwen3-8B-W8A8", # enforce_eager=True, tensor_parallel_size=1, trust_remote_code=True, gpu_memory_utilization=0.7, quantization="ascend", ) # Generate texts from the prompts. outputs = llm.generate(prompts, sampling_params) for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` ```text Prompt: 'The president of the United States is Mr.', Generated text: ' Trump. The president of the United States is Mr. Biden. Which of the following statements is correct? \n\nA. Mr. Trump is Mr. Biden. \nB. Mr. Trump is not Mr. Biden. \nC. The president of the United States is not Mr. Trump. \nD. The president of the United States is not Mr. Biden.\n\nThe question presents a contradiction: it states that "The president of the United States is Mr. Trump" and "The president of' ``` - vLLM version: 86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24 - vLLM main: https://github.com/vllm-project/vllm/commit/86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24 --------- Signed-off-by: Icey <1790571317@qq.com> Signed-off-by: wxsIcey <1790571317@qq.com>
2025-12-04 10:29:48 +08:00
class XliteGraphConfig:
"""
Configuration Object for xlite_graph_config from additional_config
"""
def __init__(self, xlite_graph_config, vllm_config):
self.enabled = xlite_graph_config.get("enabled", False)
self.full_mode = xlite_graph_config.get("full_mode", False)
if self.enabled:
if bool(vllm_config.speculative_config):
raise RuntimeError(
"Xlite graph mode is not compatible with speculative decoding. Please disable speculative decoding."
)
if vllm_config.parallel_config.pipeline_parallel_size > 1:
raise RuntimeError(
"Xlite graph mode is not compatible with pipeline parallelism. Please set pipeline_parallel_size to 1."
)
if vllm_config.cache_config.block_size != 128:
raise RuntimeError(
"Xlite graph mode is only compatible with block_size of 128. Please set block_size to 128."
)
class WeightPrefetchConfig:
"""
Configuration Object for weight_prefetch_config from additional_config
"""
prefetch_ratio: dict = {
"attn": {
"qkv": 1.0,
"o": 1.0,
},
"moe": {
"gate_up": 0.8
}
}
def __init__(self, weight_prefetch_config: dict):
self.enabled = weight_prefetch_config.get("enabled", False)
self.prefetch_ratio = weight_prefetch_config.get(
"prefetch_ratio", self.prefetch_ratio)
[EPLB]Eplb Config Renaming (#5533) ### What this PR does / why we need it? 1. Rename num_iterations_eplb_update to expert_heat_collection_interval. 2. Rename num_wait_worker_iterations to algorithm_execution_interval. 3. Rename init_redundancy_expert to num_redundant_experts because the variable with the same meaning in vLLM is named this way. 4. Delete gate_eplb because we don't need this feature. 5. Move eplb config into a dict in additional config. 6. Depend on pr5817 ### Does this PR introduce _any_ user-facing change? before this pr: `--additional-config '{"dynamic_eplb":true, "num_iterations_eplb_update": 4000, "num_wait_worker_iterations": 150, "init_redundancy_expert": 16, "expert_map_path": "xxx.json"}'` after this pr: `--additional-config '{"eplb_config":{"dynamic_eplb":true,"expert_heat_collection_interval":4000, "algorithm_execution_interval":150,"num_redundant_experts": 16, "expert_map_path": "xxx.json"}}'` ### How was this patch tested? #### test qwen3-235b eplb num_redundant_experts=16 without pr5817 | dataset | version | metric | mode | vllm-api-general-chat | |----- | ----- | ----- | ----- | -----| | aime2024 | 604a78 | accuracy | gen | 83.33 | with pr5817 | dataset | version | metric | mode | vllm-api-general-chat | |----- | ----- | ----- | ----- | -----| | aime2024 | 604a78 | accuracy | gen | 86.67 | - vLLM version: v0.13.0 - vLLM main: https://github.com/vllm-project/vllm/commit/45c1ca1ca1ee8fa06df263c8715e8a412ff408d4 Signed-off-by: shenchuxiaofugui <1311027364@qq.com>
2026-01-15 10:26:44 +08:00
class EplbConfig:
"""
Configuration Object for xlite_graph_config from additional_config
"""
_defaults = {
"dynamic_eplb": False,
"expert_map_path": None,
"expert_heat_collection_interval": 400,
"algorithm_execution_interval": 30,
"expert_map_record_path": None,
"num_redundant_experts": 0,
"eplb_policy_type": 1
}
def __init__(self, user_config: dict = {}):
self.config = self._defaults.copy()
if user_config and isinstance(user_config, dict):
for key, value in user_config.items():
if key in self.config:
self.config[key] = value
else:
raise ValueError(f"Config has no attribute '{key}'")
self._validate_config()
def __getattr__(self, key):
if key in self.config:
return self.config[key]
raise AttributeError(f"Config has no attribute '{key}'")
def _validate_config(self):
if self.expert_map_path is not None:
if self.expert_map_path[-5:] != ".json":
raise TypeError("The expert_map is not json.")
if not os.path.exists(self.expert_map_path):
raise ValueError("The expert_map is not exist.")
if self.expert_map_record_path is not None:
self.config["dynamic_eplb"] = True
if self.expert_map_record_path[-5:] != ".json":
raise TypeError("The expert_map_record_path is not json.")
dirname = os.path.dirname(self.expert_map_record_path)
os.makedirs(dirname, exist_ok=True)
for key in [
"expert_heat_collection_interval",
"algorithm_execution_interval", "num_redundant_experts"
]:
if not isinstance(self.config[key], int):
raise TypeError(f"{key} must be an integer")
if self.config[key] < 0: # type: ignore
raise ValueError(
f"{key} must greater than 0; got {self.config[key]} instead"
)
if self.eplb_policy_type not in [0, 1, 2, 3]:
raise ValueError("eplb_policy_type must in [0, 1, 2, 3]")
_ASCEND_CONFIG: Optional[AscendConfig] = None
def init_ascend_config(vllm_config):
additional_config = vllm_config.additional_config if vllm_config.additional_config is not None else {}
refresh = additional_config.get("refresh",
False) if additional_config else False
global _ASCEND_CONFIG
if _ASCEND_CONFIG is not None and not refresh:
return _ASCEND_CONFIG
_ASCEND_CONFIG = AscendConfig(vllm_config)
return _ASCEND_CONFIG
def clear_ascend_config():
global _ASCEND_CONFIG
_ASCEND_CONFIG = None
def get_ascend_config():
global _ASCEND_CONFIG
if _ASCEND_CONFIG is None:
raise RuntimeError(
"Ascend config is not initialized. Please call init_ascend_config first."
)
return _ASCEND_CONFIG