bugfix for mtp with multistream_moe (#3419)
### What this PR does / why we need it? when infer deepseek mtp layer with multistream_moe, we should pass a boolean to evaluate this feature and fix bugs when we are in mtp layer - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0 --------- Signed-off-by: zouyida2052 <zouyida2002@gmail.com>
This commit is contained in:
@@ -17,6 +17,9 @@ class TestTorchairDeepSeekMultiTokenPredictorLayer(PytestBase):
|
||||
config = PretrainedConfig(vocab_size=1000,
|
||||
hidden_size=768,
|
||||
rms_norm_eps=1e-5)
|
||||
mocker.patch(
|
||||
'vllm_ascend.torchair.models.torchair_deepseek_mtp.get_tensor_model_parallel_world_size',
|
||||
return_value=1)
|
||||
mocker.patch(
|
||||
"vllm.model_executor.layers.vocab_parallel_embedding.VocabParallelEmbedding.__init__",
|
||||
return_value=None)
|
||||
@@ -56,6 +59,8 @@ class TestTorchairDeepSeekMultiTokenPredictorLayer(PytestBase):
|
||||
mocker.patch("torch.cat", return_value=torch.randn(2, 3, 768))
|
||||
mtp_layer.mtp_block.return_value = (torch.randn(2, 3, 768),
|
||||
torch.randn(2, 3, 768))
|
||||
mtp_layer.enorm.return_value = torch.randn(2, 3, 768)
|
||||
mtp_layer.hnorm.return_value = torch.randn(2, 3, 768)
|
||||
|
||||
input_ids = torch.tensor([[1, 2, 3], [4, 5, 6]])
|
||||
positions = torch.tensor([[0, 1, 2], [0, 1, 2]])
|
||||
@@ -65,7 +70,7 @@ class TestTorchairDeepSeekMultiTokenPredictorLayer(PytestBase):
|
||||
|
||||
output = mtp_layer(input_ids, positions, kv_cache, None,
|
||||
previous_hidden_states, inputs_embeds, 0)
|
||||
assert output.shape == (2, 3, 768)
|
||||
assert output.shape == (3, 768)
|
||||
|
||||
|
||||
class TestTorchairDeepSeekMultiTokenPredictor(PytestBase):
|
||||
|
||||
Reference in New Issue
Block a user