[Feat] Add Euler xlite graph wrapper support (#4526)

### What this PR does / why we need it?

This patch adds support for the xlite graph wrapper to vllm_ascend. Xlite provides operator implementations of the transformer network on Ascend hardware. For details about xlite, please refer to the following link: https://gitee.com/openeuler/GVirt/blob/master/xlite/README.md

The latest performance comparison data between xlite and the default aclgraph mode is as follows:

## Qwen3 32B TPS 910B3(A2) Online Inference Performance Comparison

- aclgraph: main(c4a71fc6)
- xlite-full: main(c4a71fc6) + xlite-full
- xlite-decode-only: main(c4a71fc6) + xlite-decode-only
- diff1: Performance comparison between xlite-full and aclgraph
- diff2: Performance comparison between xlite-decode-only and aclgraph

### Does this PR introduce _any_ user-facing change?

Enable the xlite graph mode by setting xlite_graph_config:

    --additional-config='{"xlite_graph_config": {"enabled": true}}' # Enabled for decode only
    --additional-config='{"xlite_graph_config": {"enabled": true, "full_mode": true}}' # Enabled for prefill and decode

- vLLM version: v0.12.0
- vLLM main: ad32e3e19c

---------

Signed-off-by: lulina <lina.lulina@huawei.com>
Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-12-08 08:27:46 +08:00
parent 8fdb689a32
commit 2be0fe2691
13 changed files with 553 additions and 3 deletions
--- a/vllm_ascend/ascend_config.py
+++ b/vllm_ascend/ascend_config.py
@@ -72,6 +72,10 @@ class AscendConfig:
        self.torchair_graph_config = TorchairGraphConfig(
            torchair_graph_config, vllm_config, additional_config)

+        xlite_graph_config = additional_config.get("xlite_graph_config", {})
+        self.xlite_graph_config = XliteGraphConfig(xlite_graph_config,
+                                                   vllm_config)
+
        ascend_compilation_config = additional_config.get(
            "ascend_compilation_config", {})
        self.ascend_compilation_config = AscendCompilationConfig(
@@ -291,6 +295,29 @@ class TorchairGraphConfig:
            )


+class XliteGraphConfig:
+    """
+    Configuration object for the ``xlite_graph_config`` entry of
+    ``additional_config``.
+
+    Attributes:
+        enabled: Value of the ``"enabled"`` key; ``False`` when the key is
+            absent. Per the accompanying commit message, enabling xlite
+            without ``full_mode`` applies it to decode only.
+        full_mode: Value of the ``"full_mode"`` key; ``False`` when the key
+            is absent. Per the accompanying commit message, setting it
+            together with ``enabled`` applies xlite to prefill and decode.
+    """
+
+    def __init__(self, xlite_graph_config, vllm_config):
+        """
+        Read the xlite options and reject incompatible vLLM settings.
+
+        Args:
+            xlite_graph_config: Mapping holding the optional ``"enabled"``
+                and ``"full_mode"`` keys; it is only accessed via ``.get``.
+            vllm_config: Object exposing ``speculative_config``,
+                ``parallel_config.pipeline_parallel_size`` and
+                ``cache_config.block_size``. These are read only when
+                ``enabled`` is truthy.
+
+        Raises:
+            RuntimeError: If xlite is enabled and ``speculative_config`` is
+                truthy, ``pipeline_parallel_size`` is greater than 1, or
+                ``block_size`` is not 128.
+        """
+        self.enabled = xlite_graph_config.get("enabled", False)
+        self.full_mode = xlite_graph_config.get("full_mode", False)
+        # Compatibility checks run only when xlite is enabled; ``full_mode``
+        # is stored as given and is not validated here.
+        if self.enabled:
+            # Any truthy speculative_config counts as speculative decoding.
+            if bool(vllm_config.speculative_config):
+                raise RuntimeError(
+                    "Xlite graph mode is not compatible with speculative decoding. Please disable speculative decoding."
+                )
+            if vllm_config.parallel_config.pipeline_parallel_size > 1:
+                raise RuntimeError(
+                    "Xlite graph mode is not compatible with pipeline parallelism. Please set pipeline_parallel_size to 1."
+                )
+            # 128 is the only KV-cache block size accepted in xlite mode.
+            if vllm_config.cache_config.block_size != 128:
+                raise RuntimeError(
+                    "Xlite graph mode is only compatible with block_size of 128. Please set block_size to 128."
+                )
+
+
 class DumpConfig:
    """
    Configuration object for dump/PrecisionDebugger settings.