diff --git a/tests/e2e/multicard/4-cards/long_sequence/test_basic.py b/tests/e2e/multicard/4-cards/long_sequence/test_basic.py
index 40f5f700..70317003 100644
--- a/tests/e2e/multicard/4-cards/long_sequence/test_basic.py
+++ b/tests/e2e/multicard/4-cards/long_sequence/test_basic.py
@@ -236,7 +236,8 @@ def test_dcp_basic():
             decode_context_parallel_size=2,
             max_num_batched_tokens=1024,
             enable_expert_parallel=True,
-            block_size=128) as runner:
+            block_size=128,
+            compilation_config={"pass_config": {"enable_sp": True}}) as runner:
         runner.model.generate(prompts, sampling_params)
 
 
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index 36546b26..7d9710f1 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -1284,7 +1284,7 @@ class NPUModelRunner(GPUModelRunner):
         if (
             cudagraph_mode == CUDAGraphMode.FULL
             or (enable_sp() and not self.model_config.use_mla)
-            and self.pcp_size == 1  # TODO(lxs): fix this
+            and self.pcp_size * self.dcp_size == 1
         ):
             # Currently, Graph Mode and SP will both pad num_tokens,
             # Another possible condition is num_tokens_padded != num_tokens_unpadded