xc-llm-ascend/vllm_ascend/xlite/xlite_model_runner.py

#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
# Adapted from vllm-project/vllm/vllm/worker/gpu_model_runner.py
# isort: skip_file
import torch.nn as nn
from vllm.config import CUDAGraphMode
from vllm.v1.kv_cache_interface import KVCacheConfig
from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
class XliteModelRunner(NPUModelRunner):
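    """NPUModelRunner variant that runs the loaded model through an
    XliteWrapper, enabling the Xlite graph execution path."""
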
    def get_model(self) -> nn.Module:
        return self.model.unwrap()

    def load_model(self) -> None:
        super().load_model()
        from vllm_ascend.xlite.xlite import XliteWrapper
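        # XliteWrapper is imported locally, so the xlite runtime is only
        # pulled in when this runner is actually used. The wrapper replaces
        # self.model; get_model() unwraps it for callers that expect the
        # raw nn.Module.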
        self.model = XliteWrapper(self.model, self.vllm_config)

    def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
        super().initialize_kv_cache(kv_cache_config)
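        # Register the KV cache tensors allocated by the base runner with
        # the Xlite wrapper.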
        self.model.register_kv_caches(self.kv_caches)

    def _should_build_dummy_attn_metadata(
        self,
        force_attention: bool = False,
        is_profile: bool = False,
        cudagraph_runtime_mode: CUDAGraphMode | None = None,
    ) -> bool:
        """
        Override to build attention metadata during dummy_run when xlite is
        enabled. For xlite, we need to build metadata during DP dummy_run to
        ensure all ranks have consistent metadata, even when some ranks have
        no requests.
        """
        base_condition = super()._should_build_dummy_attn_metadata(
            force_attention, is_profile, cudagraph_runtime_mode)
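        # Profiling dummy runs are excluded below: only real DP dummy runs
        # need attention metadata built consistently on every rank.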
        xlite_condition = (self.ascend_config.xlite_graph_config.enabled
                           and not is_profile)
        return base_condition or xlite_condition