bugfix for mtp with multistream_moe (#3419)

### What this PR does / why we need it? When running inference on the DeepSeek MTP layer with multistream_moe, we should pass a boolean to evaluate this feature, and fix the bugs that occur when we are in the MTP layer. - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0 --------- Signed-off-by: zouyida2052 <zouyida2002@gmail.com>
2025-10-15 08:59:58 +08:00
parent c2c1db78a7
commit 3642b64afc
5 changed files with 22 additions and 11 deletions
--- a/tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_torchair_correctness.py
+++ b/tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_torchair_correctness.py
@@ -41,6 +41,7 @@ def test_mtp_torchair_correctness(
                            "use_cached_graph": False,
                            "graph_batch_sizes": [1, 2, 4],
                        },
+                        "multistream_overlap_shared_expert": "True"
                    }) as ref_llm:
        ref_outputs = ref_llm.generate(example_prompts, sampling_config)
    with VllmRunner(model_name,
@@ -60,7 +61,8 @@ def test_mtp_torchair_correctness(
                            "enabled": True,
                            "use_cached_graph": False,
                            "graph_batch_sizes": [1, 2, 4],
-                        }
+                        },
+                        "multistream_overlap_shared_expert": "True"
                    }) as spec_llm:
        spec_outputs = spec_llm.generate(example_prompts, sampling_config)

--- a/tests/ut/torchair/models/test_torchair_deepseek_mtp.py
+++ b/tests/ut/torchair/models/test_torchair_deepseek_mtp.py
@@ -17,6 +17,9 @@ class TestTorchairDeepSeekMultiTokenPredictorLayer(PytestBase):
        config = PretrainedConfig(vocab_size=1000,
                                  hidden_size=768,
                                  rms_norm_eps=1e-5)
+        mocker.patch(
+            'vllm_ascend.torchair.models.torchair_deepseek_mtp.get_tensor_model_parallel_world_size',
+            return_value=1)
        mocker.patch(
            "vllm.model_executor.layers.vocab_parallel_embedding.VocabParallelEmbedding.__init__",
            return_value=None)
@@ -56,6 +59,8 @@ class TestTorchairDeepSeekMultiTokenPredictorLayer(PytestBase):
        mocker.patch("torch.cat", return_value=torch.randn(2, 3, 768))
        mtp_layer.mtp_block.return_value = (torch.randn(2, 3, 768),
                                            torch.randn(2, 3, 768))
+        mtp_layer.enorm.return_value = torch.randn(2, 3, 768)
+        mtp_layer.hnorm.return_value = torch.randn(2, 3, 768)

        input_ids = torch.tensor([[1, 2, 3], [4, 5, 6]])
        positions = torch.tensor([[0, 1, 2], [0, 1, 2]])
@@ -65,7 +70,7 @@ class TestTorchairDeepSeekMultiTokenPredictorLayer(PytestBase):

        output = mtp_layer(input_ids, positions, kv_cache, None,
                           previous_hidden_states, inputs_embeds, 0)
-        assert output.shape == (2, 3, 768)
+        assert output.shape == (3, 768)


 class TestTorchairDeepSeekMultiTokenPredictor(PytestBase):