[feat][torchair] support super kernel feat for quantized dsr1 (#3485)

### What this PR does / why we need it?
Port #1916 and #2157 to the master branch to fuse operators in DeepSeek MoE layers, which can reduce scheduling overhead on devices. Note that this feature is valid only when `tp_size = 1` and `multistream_overlap_shared_expert` is enabled in torchair graph mode.

### Does this PR introduce _any_ user-facing change?
Users can enable this feature with `--additional-config '{"torchair_graph_config":{"enabled":true, "enable_super_kernel":true}, "multistream_overlap_shared_expert":true}'`.

### How was this patch tested?
E2E DeepSeek serving with 2P1D disaggregated prefill scenarios.

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

---------

Signed-off-by: linfeng-yuan <1102311262@qq.com>
2025-10-20 20:04:37 +08:00
parent 70bef33f13
commit 068ed706c8
8 changed files with 138 additions and 86 deletions
--- a/tests/ut/ops/test_linear.py
+++ b/tests/ut/ops/test_linear.py
@@ -56,17 +56,18 @@ class TestAscendUnquantizedLinearMethod(TestBase):

    def setUp(self):
        self.method = AscendUnquantizedLinearMethod()
+        self.layer = mock.MagicMock()
+        mock_dtype = mock.PropertyMock(return_value=torch.float16)
+        type(self.layer.weight.data).dtype = mock_dtype

    @mock.patch("vllm_ascend.ops.linear.is_enable_nz")
    @mock.patch("torch_npu.npu_format_cast")
    @mock.patch("torch.version")
    def test_process_weights_after_loading_is_8_3_enable_nz(
            self, mock_version, mock_format_cast, mock_is_nz):
-        layer = mock.MagicMock()
-
        mock_version.cann = "8.3.RC1"
        mock_is_nz.return_value = 1
-        self.method.process_weights_after_loading(layer)
+        self.method.process_weights_after_loading(self.layer)
        mock_format_cast.assert_called_once()

    @mock.patch("vllm_ascend.ops.linear.is_enable_nz")
@@ -74,23 +75,19 @@ class TestAscendUnquantizedLinearMethod(TestBase):
    @mock.patch("torch.version")
    def test_process_weights_after_loading_is_8_3_disable_nz(
            self, mock_version, mock_format_cast, mock_is_nz):
-        layer = mock.MagicMock()
-
        mock_version.cann = "8.3.RC1"
        mock_is_nz.return_value = 0
-        self.method.process_weights_after_loading(layer)
+        self.method.process_weights_after_loading(self.layer)
        mock_format_cast.assert_not_called()

    @mock.patch("vllm_ascend.ops.linear.is_enable_nz")
    @mock.patch("torch.version")
    def test_process_weights_after_loading_not_8_3(self, mock_version,
                                                   mock_is_nz):
-        layer = mock.MagicMock()
-
        mock_version.cann = "8.2.RC1"
        mock_is_nz.return_value = 1
        # Should not raise exception
-        self.method.process_weights_after_loading(layer)
+        self.method.process_weights_after_loading(self.layer)


 class TestAscendRowParallelLinear(BaseLinearTest):