From 30778f371b6e95492e927f9149b7ee6ff317258b Mon Sep 17 00:00:00 2001 From: dsxsteven <36877507+dsxsteven@users.noreply.github.com> Date: Thu, 25 Dec 2025 10:38:09 +0800 Subject: [PATCH] [BugFix] Fix num_pcp_pads Assignment Issues (#5273) ### What this PR does / why we need it? `self.num_pcp_pads` was rebound on assignment, either to a truncated slice (`self.num_pcp_pads[:num_reqs]`) or to a newly built tensor, instead of being updated in place. This shrank the attribute to the current batch's request count and caused errors in certain scenarios, such as PD-disaggregated deployment. The values are now written in place into `self.num_pcp_pads[:num_reqs]`, and the total pad count is summed over the first `num_reqs` entries only. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? New e2e cases (`test_pcp_basic`, `test_pcp_full_graph`, `test_pcp_piece_wise`, `test_dcp_basic`, `test_dcp_full_graph`, `test_dcp_piece_wise`) are added to `tests/e2e/multicard/long_sequence/test_basic.py`. Co-authored by: QiuChunshuo - vLLM version: release/v0.13.0 - vLLM main: https://github.com/vllm-project/vllm/commit/ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9 --------- Signed-off-by: daishixun Co-authored-by: weijinqian0 <1184188277@qq.com> --- .../e2e/multicard/long_sequence/test_basic.py | 134 ++++++++++++++++++ vllm_ascend/worker/model_runner_v1.py | 8 +- 2 files changed, 138 insertions(+), 4 deletions(-) diff --git a/tests/e2e/multicard/long_sequence/test_basic.py b/tests/e2e/multicard/long_sequence/test_basic.py index bc6f839a..f0b319b3 100644 --- a/tests/e2e/multicard/long_sequence/test_basic.py +++ b/tests/e2e/multicard/long_sequence/test_basic.py @@ -135,3 +135,137 @@ def test_models_pcp_dcp_piece_wise(): block_size=128, quantization="ascend") as runner: runner.model.generate(prompts, sampling_params) + + +@pytest.mark.skipif(vllm_version_is('0.12.0'), + reason="0.12.0 is not supported for context sequence.") +def test_pcp_basic(): + prompts = [ + "The capital of France is", "Hello, my name is Tom, I am", + "The president of United States is", "AI future is" + ] + model = "deepseek-ai/DeepSeek-V2-Lite-Chat" + sampling_params = SamplingParams(max_tokens=32, temperature=0.0) + with VllmRunner(model, + enforce_eager=True, + max_model_len=1024, + tensor_parallel_size=2, + prefill_context_parallel_size=2, + decode_context_parallel_size=1, + max_num_batched_tokens=1024, + enable_expert_parallel=True, + block_size=128) as runner: + 
runner.model.generate(prompts, sampling_params) + + +@pytest.mark.skipif(vllm_version_is('0.12.0'), + reason="0.12.0 is not supported for context sequence.") +def test_pcp_full_graph(): + prompts = [ + "The capital of France is", "Hello, my name is Tom, I am", + "The president of United States is", "AI future is" + ] + model = "deepseek-ai/DeepSeek-V2-Lite-Chat" + sampling_params = SamplingParams(max_tokens=32, temperature=0.0) + with VllmRunner(model, + enforce_eager=False, + max_model_len=1024, + tensor_parallel_size=2, + prefill_context_parallel_size=2, + decode_context_parallel_size=1, + max_num_batched_tokens=1024, + enable_expert_parallel=True, + block_size=128, + compilation_config={ + "cudagraph_mode": "FULL_DECODE_ONLY", + "cudagraph_capture_sizes": [4, 8, 24, 48, 60] + }) as runner: + runner.model.generate(prompts, sampling_params) + + +@pytest.mark.skipif(vllm_version_is('0.12.0'), + reason="0.12.0 is not supported for context sequence.") +def test_pcp_piece_wise(): + prompts = [ + "The capital of France is", "Hello, my name is Tom, I am", + "The president of United States is", "AI future is" + ] + model = "deepseek-ai/DeepSeek-V2-Lite-Chat" + sampling_params = SamplingParams(max_tokens=32, temperature=0.0) + with VllmRunner(model, + enforce_eager=False, + max_model_len=1024, + tensor_parallel_size=2, + prefill_context_parallel_size=2, + decode_context_parallel_size=1, + max_num_batched_tokens=1024, + enable_expert_parallel=True, + block_size=128) as runner: + runner.model.generate(prompts, sampling_params) + + +@pytest.mark.skipif(vllm_version_is('0.12.0'), + reason="0.12.0 is not supported for context sequence.") +def test_dcp_basic(): + prompts = [ + "The capital of France is", "Hello, my name is Tom, I am", + "The president of United States is", "AI future is" + ] + model = "deepseek-ai/DeepSeek-V2-Lite-Chat" + sampling_params = SamplingParams(max_tokens=32, temperature=0.0) + with VllmRunner(model, + enforce_eager=True, + max_model_len=1024, + 
tensor_parallel_size=4, + prefill_context_parallel_size=1, + decode_context_parallel_size=2, + max_num_batched_tokens=1024, + enable_expert_parallel=True, + block_size=128) as runner: + runner.model.generate(prompts, sampling_params) + + +@pytest.mark.skipif(vllm_version_is('0.12.0'), + reason="0.12.0 is not supported for context sequence.") +def test_dcp_full_graph(): + prompts = [ + "The capital of France is", "Hello, my name is Tom, I am", + "The president of United States is", "AI future is" + ] + model = "deepseek-ai/DeepSeek-V2-Lite-Chat" + sampling_params = SamplingParams(max_tokens=32, temperature=0.0) + with VllmRunner(model, + enforce_eager=False, + max_model_len=1024, + tensor_parallel_size=4, + prefill_context_parallel_size=1, + decode_context_parallel_size=2, + max_num_batched_tokens=1024, + enable_expert_parallel=True, + block_size=128, + compilation_config={ + "cudagraph_mode": "FULL_DECODE_ONLY", + "cudagraph_capture_sizes": [4, 8, 24, 48, 60] + }) as runner: + runner.model.generate(prompts, sampling_params) + + +@pytest.mark.skipif(vllm_version_is('0.12.0'), + reason="0.12.0 is not supported for context sequence.") +def test_dcp_piece_wise(): + prompts = [ + "The capital of France is", "Hello, my name is Tom, I am", + "The president of United States is", "AI future is" + ] + model = "deepseek-ai/DeepSeek-V2-Lite-Chat" + sampling_params = SamplingParams(max_tokens=32, temperature=0.0) + with VllmRunner(model, + enforce_eager=False, + max_model_len=1024, + tensor_parallel_size=4, + prefill_context_parallel_size=1, + decode_context_parallel_size=2, + max_num_batched_tokens=1024, + enable_expert_parallel=True, + block_size=128) as runner: + runner.model.generate(prompts, sampling_params) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 0eda5cb6..d67632b2 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -575,10 +575,10 @@ class 
NPUModelRunner(GPUModelRunner): tokens) num_scheduled_tokens = np.array(tokens, dtype=np.int32) total_num_scheduled_tokens = sum(num_scheduled_tokens[:num_reqs]) - total_num_pcp_pads = torch.sum(self.num_pcp_pads).item() + total_num_pcp_pads = torch.sum(self.num_pcp_pads[:num_reqs]).item() else: position_pcp, pcp_unpad_mask = None, None - self.num_pcp_pads = self.num_pcp_pads[:num_reqs] + self.num_pcp_pads[:num_reqs] = 0 max_num_scheduled_tokens = max(tokens) if not scheduler_output.scheduled_spec_decode_tokens: @@ -3050,7 +3050,6 @@ class NPUModelRunner(GPUModelRunner): def _update_tokens_for_pcp(self, tokens): num_reqs = self.input_batch.num_reqs - self.num_pcp_pads = self.num_pcp_pads[:num_reqs] tokens = np.array(tokens, dtype=np.int32) num_decode_reqs = (np.array(tokens) <= self.decode_threshold).sum() num_decode_tokens = sum(tokens[:num_decode_reqs]) @@ -3059,7 +3058,8 @@ class NPUModelRunner(GPUModelRunner): (2 * self.pcp_size)).astype(np.int32) * (2 * self.pcp_size) num_padded_scheduled_tokens[:num_decode_reqs] = ( tokens[:num_decode_reqs] * self.pcp_size) - self.num_pcp_pads = torch.tensor(num_padded_scheduled_tokens - tokens) + self.num_pcp_pads[:num_reqs] = torch.tensor( + num_padded_scheduled_tokens - tokens) cu_padded_tokens, pcp_padded_arange = \ self._get_cumsum_and_arange(num_padded_scheduled_tokens) unpad_mask = torch.from_numpy(