[Feat] Add Euler xlite graph wrapper support (#4526)
### What this PR does / why we need it?

This patch adds support for the xlite graph wrapper to vllm_ascend. Xlite provides operator implementations of the transformer network on Ascend hardware. For details about xlite, please refer to the following link: https://gitee.com/openeuler/GVirt/blob/master/xlite/README.md

The latest performance comparison data between xlite and the default aclgraph mode is as follows:

## Qwen3 32B TPS 910B3(A2) Online Inference Performance Comparison

- aclgraph: main(c4a71fc6)
- xlite-full: main(c4a71fc6) + xlite-full
- xlite-decode-only: main(c4a71fc6) + xlite-decode-only
- diff1: Performance comparison between xlite-full and aclgraph
- diff2: Performance comparison between xlite-decode-only and aclgraph

### Does this PR introduce _any_ user-facing change?

Enable the xlite graph mode by setting xlite_graph_config:

    --additional-config='{"xlite_graph_config": {"enabled": true}}'  # Enabled for decode only
    --additional-config='{"xlite_graph_config": {"enabled": true, "full_mode": true}}'  # Enabled for prefill and decode

- vLLM version: v0.12.0
- vLLM main: ad32e3e19c

---------

Signed-off-by: lulina <lina.lulina@huawei.com>
Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
@@ -72,6 +72,10 @@ class AscendConfig:
|
||||
self.torchair_graph_config = TorchairGraphConfig(
|
||||
torchair_graph_config, vllm_config, additional_config)
|
||||
|
||||
xlite_graph_config = additional_config.get("xlite_graph_config", {})
|
||||
self.xlite_graph_config = XliteGraphConfig(xlite_graph_config,
|
||||
vllm_config)
|
||||
|
||||
ascend_compilation_config = additional_config.get(
|
||||
"ascend_compilation_config", {})
|
||||
self.ascend_compilation_config = AscendCompilationConfig(
|
||||
@@ -291,6 +295,29 @@ class TorchairGraphConfig:
|
||||
)
|
||||
|
||||
|
||||
class XliteGraphConfig:
    """
    Settings parsed from the ``xlite_graph_config`` entry of additional_config.

    Attributes:
        enabled: value of the ``"enabled"`` key, ``False`` when absent.
        full_mode: value of the ``"full_mode"`` key, ``False`` when absent.

    Raises:
        RuntimeError: when ``enabled`` is truthy and ``vllm_config`` has a
            truthy ``speculative_config``, a ``pipeline_parallel_size``
            greater than 1, or a cache ``block_size`` other than 128.
    """

    def __init__(self, xlite_graph_config, vllm_config):
        options = xlite_graph_config
        self.enabled = options.get("enabled", False)
        self.full_mode = options.get("full_mode", False)

        # vllm_config is only inspected when xlite graph mode is switched on.
        if not self.enabled:
            return

        # The checks run in a fixed order; the first violation wins.
        speculative = vllm_config.speculative_config
        if bool(speculative):
            raise RuntimeError(
                "Xlite graph mode is not compatible with speculative decoding. Please disable speculative decoding."
            )

        pp_size = vllm_config.parallel_config.pipeline_parallel_size
        if pp_size > 1:
            raise RuntimeError(
                "Xlite graph mode is not compatible with pipeline parallelism. Please set pipeline_parallel_size to 1."
            )

        kv_block_size = vllm_config.cache_config.block_size
        if kv_block_size != 128:
            raise RuntimeError(
                "Xlite graph mode is only compatible with block_size of 128. Please set block_size to 128."
            )
|
||||
|
||||
|
||||
class DumpConfig:
|
||||
"""
|
||||
Configuration object for dump/PrecisionDebugger settings.
|
||||
|
||||
Reference in New Issue
Block a user