[BugFix] Fix num_pcp_pads Assignment Issues (#5273)

### What this PR does / why we need it?
The buffer `self.num_pcp_pads` was incorrectly truncated during
assignment: rebinding it to a slice shrank the preallocated tensor,
which caused errors in certain scenarios such as PD disaggregation.
The fix assigns into the slice in place, so the buffer keeps its full
size.
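
For illustration, a minimal sketch of the failure mode (the buffer name is kept from the runner; capacity and values are hypothetical):

```python
import torch

MAX_NUM_REQS = 8  # hypothetical capacity of the preallocated buffer

# Buggy pattern: rebinding the name to a slice permanently truncates
# the buffer, so a later, larger batch has nowhere to write.
num_pcp_pads = torch.zeros(MAX_NUM_REQS, dtype=torch.int32)
num_pcp_pads = num_pcp_pads[:2]  # step with 2 requests
# num_pcp_pads[:4] = torch.ones(4, dtype=torch.int32)  # RuntimeError: size mismatch

# Fixed pattern: assign into the slice; the buffer keeps its full size.
num_pcp_pads = torch.zeros(MAX_NUM_REQS, dtype=torch.int32)
num_pcp_pads[:2] = torch.tensor([3, 1], dtype=torch.int32)  # small batch
num_pcp_pads[:4] = torch.tensor([3, 1, 2, 0], dtype=torch.int32)  # larger batch, fine
```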
### Does this PR introduce _any_ user-facing change?
NO
### How was this patch tested?
New PCP/DCP tests were added (`test_pcp_basic`, `test_pcp_full_graph`, `test_pcp_piece_wise`, and their DCP counterparts; see the first changed file below).
Co-authored-by: QiuChunshuo <qiuchunshuo@huawei.com>

- vLLM version: release/v0.13.0
- vLLM main: ad32e3e19c

---------

Signed-off-by: daishixun <dsxsteven@sina.com>
Co-authored-by: weijinqian0 <1184188277@qq.com>
Author: dsxsteven
Date: 2025-12-25 10:38:09 +08:00
Committed by: GitHub
Parent commit: fca2f948c1
Commit: 30778f371b

2 changed files with 138 additions and 4 deletions

Changed file 1 of 2 (new PCP/DCP tests):

@@ -135,3 +135,137 @@ def test_models_pcp_dcp_piece_wise():
                     block_size=128,
                     quantization="ascend") as runner:
         runner.model.generate(prompts, sampling_params)
+
+
+@pytest.mark.skipif(vllm_version_is('0.12.0'),
+                    reason="0.12.0 is not supported for context sequence.")
+def test_pcp_basic():
+    prompts = [
+        "The capital of France is", "Hello, my name is Tom, I am",
+        "The president of United States is", "AI future is"
+    ]
+    model = "deepseek-ai/DeepSeek-V2-Lite-Chat"
+    sampling_params = SamplingParams(max_tokens=32, temperature=0.0)
+    with VllmRunner(model,
+                    enforce_eager=True,
+                    max_model_len=1024,
+                    tensor_parallel_size=2,
+                    prefill_context_parallel_size=2,
+                    decode_context_parallel_size=1,
+                    max_num_batched_tokens=1024,
+                    enable_expert_parallel=True,
+                    block_size=128) as runner:
+        runner.model.generate(prompts, sampling_params)
+
+
+@pytest.mark.skipif(vllm_version_is('0.12.0'),
+                    reason="0.12.0 is not supported for context sequence.")
+def test_pcp_full_graph():
+    prompts = [
+        "The capital of France is", "Hello, my name is Tom, I am",
+        "The president of United States is", "AI future is"
+    ]
+    model = "deepseek-ai/DeepSeek-V2-Lite-Chat"
+    sampling_params = SamplingParams(max_tokens=32, temperature=0.0)
+    with VllmRunner(model,
+                    enforce_eager=False,
+                    max_model_len=1024,
+                    tensor_parallel_size=2,
+                    prefill_context_parallel_size=2,
+                    decode_context_parallel_size=1,
+                    max_num_batched_tokens=1024,
+                    enable_expert_parallel=True,
+                    block_size=128,
+                    compilation_config={
+                        "cudagraph_mode": "FULL_DECODE_ONLY",
+                        "cudagraph_capture_sizes": [4, 8, 24, 48, 60]
+                    }) as runner:
+        runner.model.generate(prompts, sampling_params)
+
+
+@pytest.mark.skipif(vllm_version_is('0.12.0'),
+                    reason="0.12.0 is not supported for context sequence.")
+def test_pcp_piece_wise():
+    prompts = [
+        "The capital of France is", "Hello, my name is Tom, I am",
+        "The president of United States is", "AI future is"
+    ]
+    model = "deepseek-ai/DeepSeek-V2-Lite-Chat"
+    sampling_params = SamplingParams(max_tokens=32, temperature=0.0)
+    with VllmRunner(model,
+                    enforce_eager=False,
+                    max_model_len=1024,
+                    tensor_parallel_size=2,
+                    prefill_context_parallel_size=2,
+                    decode_context_parallel_size=1,
+                    max_num_batched_tokens=1024,
+                    enable_expert_parallel=True,
+                    block_size=128) as runner:
+        runner.model.generate(prompts, sampling_params)
+
+
+@pytest.mark.skipif(vllm_version_is('0.12.0'),
+                    reason="0.12.0 is not supported for context sequence.")
+def test_dcp_basic():
+    prompts = [
+        "The capital of France is", "Hello, my name is Tom, I am",
+        "The president of United States is", "AI future is"
+    ]
+    model = "deepseek-ai/DeepSeek-V2-Lite-Chat"
+    sampling_params = SamplingParams(max_tokens=32, temperature=0.0)
+    with VllmRunner(model,
+                    enforce_eager=True,
+                    max_model_len=1024,
+                    tensor_parallel_size=4,
+                    prefill_context_parallel_size=1,
+                    decode_context_parallel_size=2,
+                    max_num_batched_tokens=1024,
+                    enable_expert_parallel=True,
+                    block_size=128) as runner:
+        runner.model.generate(prompts, sampling_params)
+
+
+@pytest.mark.skipif(vllm_version_is('0.12.0'),
+                    reason="0.12.0 is not supported for context sequence.")
+def test_dcp_full_graph():
+    prompts = [
+        "The capital of France is", "Hello, my name is Tom, I am",
+        "The president of United States is", "AI future is"
+    ]
+    model = "deepseek-ai/DeepSeek-V2-Lite-Chat"
+    sampling_params = SamplingParams(max_tokens=32, temperature=0.0)
+    with VllmRunner(model,
+                    enforce_eager=False,
+                    max_model_len=1024,
+                    tensor_parallel_size=4,
+                    prefill_context_parallel_size=1,
+                    decode_context_parallel_size=2,
+                    max_num_batched_tokens=1024,
+                    enable_expert_parallel=True,
+                    block_size=128,
+                    compilation_config={
+                        "cudagraph_mode": "FULL_DECODE_ONLY",
+                        "cudagraph_capture_sizes": [4, 8, 24, 48, 60]
+                    }) as runner:
+        runner.model.generate(prompts, sampling_params)
+
+
+@pytest.mark.skipif(vllm_version_is('0.12.0'),
+                    reason="0.12.0 is not supported for context sequence.")
+def test_dcp_piece_wise():
+    prompts = [
+        "The capital of France is", "Hello, my name is Tom, I am",
+        "The president of United States is", "AI future is"
+    ]
+    model = "deepseek-ai/DeepSeek-V2-Lite-Chat"
+    sampling_params = SamplingParams(max_tokens=32, temperature=0.0)
+    with VllmRunner(model,
+                    enforce_eager=False,
+                    max_model_len=1024,
+                    tensor_parallel_size=4,
+                    prefill_context_parallel_size=1,
+                    decode_context_parallel_size=2,
+                    max_num_batched_tokens=1024,
+                    enable_expert_parallel=True,
+                    block_size=128) as runner:
+        runner.model.generate(prompts, sampling_params)

Changed file 2 of 2 (NPUModelRunner):

@@ -575,10 +575,10 @@ class NPUModelRunner(GPUModelRunner):
                  tokens)
             num_scheduled_tokens = np.array(tokens, dtype=np.int32)
             total_num_scheduled_tokens = sum(num_scheduled_tokens[:num_reqs])
-            total_num_pcp_pads = torch.sum(self.num_pcp_pads).item()
+            total_num_pcp_pads = torch.sum(self.num_pcp_pads[:num_reqs]).item()
         else:
             position_pcp, pcp_unpad_mask = None, None
-            self.num_pcp_pads = self.num_pcp_pads[:num_reqs]
+            self.num_pcp_pads[:num_reqs] = 0
         max_num_scheduled_tokens = max(tokens)
         if not scheduler_output.scheduled_spec_decode_tokens:
@@ -3050,7 +3050,6 @@ class NPUModelRunner(GPUModelRunner):
     def _update_tokens_for_pcp(self, tokens):
         num_reqs = self.input_batch.num_reqs
-        self.num_pcp_pads = self.num_pcp_pads[:num_reqs]
         tokens = np.array(tokens, dtype=np.int32)
         num_decode_reqs = (np.array(tokens) <= self.decode_threshold).sum()
         num_decode_tokens = sum(tokens[:num_decode_reqs])
@@ -3059,7 +3058,8 @@ class NPUModelRunner(GPUModelRunner):
             (2 * self.pcp_size)).astype(np.int32) * (2 * self.pcp_size)
         num_padded_scheduled_tokens[:num_decode_reqs] = (
             tokens[:num_decode_reqs] * self.pcp_size)
-        self.num_pcp_pads = torch.tensor(num_padded_scheduled_tokens - tokens)
+        self.num_pcp_pads[:num_reqs] = torch.tensor(
+            num_padded_scheduled_tokens - tokens)
         cu_padded_tokens, pcp_padded_arange = \
             self._get_cumsum_and_arange(num_padded_scheduled_tokens)
         unpad_mask = torch.from_numpy(
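
For context, a standalone sketch of the padding arithmetic the last hunk writes into `self.num_pcp_pads` (`pcp_size` matches the tests above; the decode threshold value is hypothetical): prefill requests are padded up to a multiple of 2 * pcp_size, decode requests are scaled by pcp_size, and the per-request difference is what lands in the slice.

```python
import numpy as np

pcp_size = 2          # prefill context parallel size, as in the tests above
decode_threshold = 1  # hypothetical: requests at or below this count as decodes

tokens = np.array([1, 1, 5, 9], dtype=np.int32)  # two decodes, two prefills
num_decode_reqs = (tokens <= decode_threshold).sum()

# Prefill requests round up to a multiple of 2 * pcp_size ...
num_padded_scheduled_tokens = (np.ceil(
    tokens / (2 * pcp_size)).astype(np.int32) * (2 * pcp_size))
# ... while decode requests are scaled by pcp_size instead.
num_padded_scheduled_tokens[:num_decode_reqs] = (
    tokens[:num_decode_reqs] * pcp_size)

num_pcp_pads = num_padded_scheduled_tokens - tokens
print(num_padded_scheduled_tokens)  # [ 2  2  8 12]
print(num_pcp_pads)                 # [1 1 3 3]
```

With the fix, these values are written into `self.num_pcp_pads[:num_reqs]` of a buffer sized for the maximum batch, so the unused tail stays allocated for later, larger batches.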