[bugfix] align max_num_batched_tokens with tp*pcp when using FLASHCOMM1 (#6000)
### What this PR does / why we need it?
Align `max_num_batched_tokens` with tp*pcp when FLASHCOMM1 is enabled, to avoid an
assertion error in `NPUModelRunner._dummy_run`.
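A minimal sketch of the rounding this implies, assuming hypothetical names (`align_max_num_batched_tokens`, `tp_size`, `pcp_size`) introduced here for illustration only; the actual vllm-ascend change may implement this differently:

```python
import math

def align_max_num_batched_tokens(max_num_batched_tokens: int,
                                 tp_size: int,
                                 pcp_size: int) -> int:
    """Round max_num_batched_tokens up to a multiple of tp_size * pcp_size.

    If the padded token batch is split across tensor-parallel and
    prefill-context-parallel ranks, a value that is not divisible by
    tp_size * pcp_size can trip the shape assert in
    NPUModelRunner._dummy_run, so it is padded up front.
    """
    group = tp_size * pcp_size
    return math.ceil(max_num_batched_tokens / group) * group

# Example: tp=2, pcp=1 keeps 8192 unchanged; tp=2, pcp=3 pads 8192 -> 8196.
assert align_max_num_batched_tokens(8192, 2, 1) == 8192
assert align_max_num_batched_tokens(8192, 2, 3) == 8196
```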
- vLLM version: v0.13.0
- vLLM main: 2c24bc6996
---------
Signed-off-by: QiuChunshuo <qiuchunshuo@huawei.com>
```diff
@@ -176,12 +176,16 @@ class TestAscendMLAImpl(TestBase):
         vllm_config = MagicMock()
         speculative_config = MagicMock()
         model_config = MagicMock()
+        parallel_config = MagicMock()
+        parallel_config.prefill_context_parallel_size = 1
+        parallel_config.tensor_parallel_size = 2
         speculative_config.num_speculative_tokens = 4
         vllm_config.speculative_config = speculative_config
         model_config.dtype = torch.float16
         vllm_config.model_config = model_config
         get_current_vllm_config.return_value = vllm_config
         vllm_config.additional_config = {"refresh": True}
+        vllm_config.parallel_config = parallel_config
         init_ascend_config(vllm_config)

         num_heads = 256
```