bugfix for mtp with multistream_moe (#3419)
### What this PR does / why we need it?

When inferring the DeepSeek MTP layer with multistream_moe, we should pass a boolean to evaluate this feature. This PR also fixes bugs that occur when we are in the MTP layer.

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

---------

Signed-off-by: zouyida2052 <zouyida2002@gmail.com>
This commit is contained in:
@@ -41,6 +41,7 @@ def test_mtp_torchair_correctness(
|
||||
"use_cached_graph": False,
|
||||
"graph_batch_sizes": [1, 2, 4],
|
||||
},
|
||||
"multistream_overlap_shared_expert": "True"
|
||||
}) as ref_llm:
|
||||
ref_outputs = ref_llm.generate(example_prompts, sampling_config)
|
||||
with VllmRunner(model_name,
|
||||
@@ -60,7 +61,8 @@ def test_mtp_torchair_correctness(
|
||||
"enabled": True,
|
||||
"use_cached_graph": False,
|
||||
"graph_batch_sizes": [1, 2, 4],
|
||||
}
|
||||
},
|
||||
"multistream_overlap_shared_expert": "True"
|
||||
}) as spec_llm:
|
||||
spec_outputs = spec_llm.generate(example_prompts, sampling_config)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user