[Bugfix] v0.18.0 support FlashComm1 & DCP for Qwen (#7726)
### What this PR does / why we need it?

This PR backports the changes from #7673 ([Bugfix] support FlashComm1 & DCP for Qwen) to the releases/v0.18.0 branch.

--------

Signed-off-by: Yang Yuxi <907276627@qq.com>
This commit is contained in:
@@ -236,7 +236,8 @@ def test_dcp_basic():
|
|||||||
decode_context_parallel_size=2,
|
decode_context_parallel_size=2,
|
||||||
max_num_batched_tokens=1024,
|
max_num_batched_tokens=1024,
|
||||||
enable_expert_parallel=True,
|
enable_expert_parallel=True,
|
||||||
block_size=128) as runner:
|
block_size=128,
|
||||||
|
compilation_config={"pass_config": {"enable_sp": True}}) as runner:
|
||||||
runner.model.generate(prompts, sampling_params)
|
runner.model.generate(prompts, sampling_params)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -1284,7 +1284,7 @@ class NPUModelRunner(GPUModelRunner):
|
|||||||
if (
|
if (
|
||||||
cudagraph_mode == CUDAGraphMode.FULL
|
cudagraph_mode == CUDAGraphMode.FULL
|
||||||
or (enable_sp() and not self.model_config.use_mla)
|
or (enable_sp() and not self.model_config.use_mla)
|
||||||
and self.pcp_size == 1 # TODO(lxs): fix this
|
and self.pcp_size * self.dcp_size == 1
|
||||||
):
|
):
|
||||||
# Currently, Graph Mode and SP will both pad num_tokens,
|
# Currently, Graph Mode and SP will both pad num_tokens,
|
||||||
# Another possible condition is num_tokens_padded != num_tokens_unpadded
|
# Another possible condition is num_tokens_padded != num_tokens_unpadded
|
||||||
|
|||||||
Reference in New Issue
Block a user