From 3640c60b0eb4d4cb104e20bfa406d3f1d17920a7 Mon Sep 17 00:00:00 2001 From: sdmyzlp <117554856+sdmyzlp@users.noreply.github.com> Date: Sat, 7 Jun 2025 14:28:20 +0800 Subject: [PATCH] Avoid unfused Transpose in DeepSeekV3 EP256 MoE layer (#1091) ### What this PR does / why we need it? View optimization in torchair (defaulted to on for Transpose with any of its axes being 1) prevents the weight Transpose from being fused with later GroupedMatmul, which decreases the performance of MoE layer when expert parallelism equals the total number of experts (e.g. EP256 for DSKv3). Add an option to solve this problem by disabling the optimization. ### Does this PR introduce _any_ user-facing change? Controlled by `additional_config.torchair_graph_config.enable_view_optimize`, defaulted to `True`. ### How was this patch tested? Tested on 1x16 910 node, with tailored 2 layer DSKv2. Signed-off-by: sdmyzlp --- docs/source/user_guide/additional_config.md | 1 + vllm_ascend/ascend_config.py | 2 ++ vllm_ascend/worker/model_runner.py | 2 ++ vllm_ascend/worker/model_runner_v1.py | 2 ++ 4 files changed, 7 insertions(+) diff --git a/docs/source/user_guide/additional_config.md b/docs/source/user_guide/additional_config.md index df39789..a884bda 100644 --- a/docs/source/user_guide/additional_config.md +++ b/docs/source/user_guide/additional_config.md @@ -38,6 +38,7 @@ The details of each config option are as follows: | Name | Type | Default | Description | | ---- | ---- | ------- | ----------- | | `enabled` | bool | `False` | Whether to enable torchair graph mode | +| `enable_view_optimize` | bool | `True` | Whether to enable torchair view optimization | | `use_cached_graph` | bool | `False` | Whether to use cached graph | | `graph_batch_sizes` | list[int] | `[]` | The batch size for torchair graph cache | | `graph_batch_sizes_init` | bool | `False` | Init graph batch size dynamically if `graph_batch_sizes` is empty | diff --git a/vllm_ascend/ascend_config.py b/vllm_ascend/ascend_config.py index 
41ebbde..065b7d0 100644 --- a/vllm_ascend/ascend_config.py +++ b/vllm_ascend/ascend_config.py @@ -55,6 +55,8 @@ class TorchairGraphConfig: "graph_batch_sizes_init", False) self.enable_multistream_shared_expert = torchair_graph_config.get( "enable_multistream_shared_expert", False) + self.enable_view_optimize = torchair_graph_config.get( + "enable_view_optimize", True) if not isinstance(self.graph_batch_sizes, list): raise TypeError("graph_batch_sizes must be list[int]") diff --git a/vllm_ascend/worker/model_runner.py b/vllm_ascend/worker/model_runner.py index 43059b8..48c5d4b 100644 --- a/vllm_ascend/worker/model_runner.py +++ b/vllm_ascend/worker/model_runner.py @@ -1037,6 +1037,8 @@ class NPUModelRunnerBase(ModelRunnerBase[TModelInputForNPU]): config = torchair.CompilerConfig() config.experimental_config.frozen_parameter = True config.experimental_config.tiling_schedule_optimize = True + config.experimental_config.enable_view_optimize = \ + get_ascend_config().torchair_graph_config.enable_view_optimize torch.npu.set_compile_mode(jit_compile=False) if not self.use_cached_npu_graph: npu_backend = torchair.get_npu_backend(compiler_config=config) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 3dda021..2f2e5c5 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -1286,6 +1286,8 @@ class NPUModelRunner(LoRAModelRunnerMixin): config = torchair.CompilerConfig() config.experimental_config.frozen_parameter = True config.experimental_config.tiling_schedule_optimize = True + config.experimental_config.enable_view_optimize = \ + get_ascend_config().torchair_graph_config.enable_view_optimize torch.npu.set_compile_mode(jit_compile=False) if not self.use_cached_npu_graph: npu_backend = torchair.get_npu_backend(compiler_config=config)