[Feat] Shared expert dp for deepseek and deepseek_mtp (#3495)

### What this PR does / why we need it?
Adds shared expert DP (data parallelism) support for the deepseek and
deepseek_mtp models; it can be combined with SP to further improve
performance.

### How was this patch tested?

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

---------

Signed-off-by: zhaozx-cn <zhaozx2116@163.com>
Co-authored-by: realliujiaxu <realliujiaxu@163.com>
This commit is contained in:
zhaozx-cn
2025-10-17 15:06:37 +08:00
committed by GitHub
parent d9ee491f70
commit bf87606932
9 changed files with 57 additions and 10 deletions

View File

@@ -57,6 +57,8 @@ class TestCustomDeepSeekMultiTokenPredictorLayer(PytestBase):
'eh_proj',
return_value=torch.randn(2, 3, 768))
mocker.patch("torch.cat", return_value=torch.randn(2, 3, 768))
mocker.patch("torch.ops.vllm.maybe_all_gather_and_maybe_unpad",
lambda x, label: x)
mtp_layer.mtp_block.return_value = (torch.randn(2, 3, 768),
torch.randn(2, 3, 768))
@@ -182,6 +184,8 @@ class TestCustomDeepSeekMTP(PytestBase):
assert isinstance(mtp, CustomDeepSeekMTP)
def test_forward(self, mocker: MockerFixture, setup_mtp):
mocker.patch("torch.ops.vllm.maybe_all_gather_and_maybe_unpad",
lambda x, label: x)
input_ids = torch.tensor([[1, 2, 3]])
positions = torch.tensor([[0, 1, 2]])
kv_caches = [torch.tensor([[0.1, 0.2, 0.3]])]

View File

@@ -1,4 +1,5 @@
import unittest
from unittest.mock import patch
import pytest
import torch
@@ -53,7 +54,9 @@ class TestAscendRMSNorm(PytestBase):
# Test case for the most common and basic scenario
@pytest.mark.parametrize(
"residual", [None, torch.randn(4, 8, dtype=torch.float16)])
def test_forward_oot_basic(self, residual):
@patch("torch.ops.vllm.maybe_chunk_residual")
def test_forward_oot_basic(self, mock_maybe_chunk_residual, residual):
mock_maybe_chunk_residual.side_effect = lambda x, residual: residual
layer = RMSNorm(hidden_size=8, eps=1e-05)
x = torch.randn(4, 8, dtype=torch.float16)
if residual is not None:
@@ -117,6 +120,8 @@ class TestAscendRMSNorm(PytestBase):
mock_forward_context.layer_idx = 0
mock_forward_context.num_hidden_layers = num_hidden_layers
mock_forward_context.fusion_linear = "gate_up_dense"
mocker.patch("torch.ops.vllm.maybe_chunk_residual",
lambda x, residual: residual)
# Ensure fusion and layer_idx increment are handled correctly
x = torch.randn(4, 8, dtype=torch.float16)