[bugfix](CP) Fix and unify the PD request discrimination logic. (#5939)

### What this PR does / why we need it?
Since PR https://github.com/vllm-project/vllm/pull/32118 changed the
criteria vLLM uses to distinguish Prefill and Decode requests, PCPManager
needs to follow the same standard. Because PCPManager computes the PD
request counts in several places, this PR consolidates the related logic
and updates the counts once per batch.
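
The updated tests (see the diff below) reduce to an init-then-use pattern: batch info is set once, and the helpers then read from it. The following is a minimal sketch of that flow, assuming only the method names and argument order visible in the diff; `PCPManagerSketch` and the method bodies are hypothetical stand-ins, not the real implementation:

```python
import numpy as np


class PCPManagerSketch:
    """Hypothetical stand-in for PCPManager, for illustration only."""

    def __init__(self) -> None:
        self.num_reqs = None
        self.num_scheduled_tokens = None

    def init_batch_info(self, num_scheduled_tokens: np.ndarray,
                        num_reqs: int) -> None:
        # Cache the batch-level info once per batch, so every downstream
        # helper works from the same PD request counts.
        self.num_reqs = num_reqs
        self.num_scheduled_tokens = num_scheduled_tokens

    def update_tokens_for_pcp(self, num_scheduled_tokens: np.ndarray,
                              arange_np: np.ndarray):
        # Post-change signature: num_reqs is no longer passed in; the
        # cached batch info from init_batch_info is used instead.
        assert self.num_reqs is not None, "call init_batch_info first"
        total = int(num_scheduled_tokens[:self.num_reqs].sum())
        return num_scheduled_tokens, arange_np[:total]


manager = PCPManagerSketch()
tokens = np.array([4, 1, 1], dtype=np.int32)  # e.g. one prefill, two decodes
manager.init_batch_info(tokens, num_reqs=3)
pcp_tokens, positions = manager.update_tokens_for_pcp(tokens, np.arange(10000))
```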

### How was this patch tested?
```bash
pytest tests/e2e/multicard/4-cards/long_sequence/test_mtp.py
```

- vLLM version: v0.13.0
- vLLM main: 11b6af5280

Signed-off-by: QiuChunshuo <qiuchunshuo@huawei.com>
Commit: 638cae824d (parent 4230bc8646)
Author: Qiu, committed via GitHub on 2026-01-31 10:26:02 +08:00
4 changed files with 88 additions and 111 deletions

```diff
@@ -141,8 +141,10 @@ def test_update_tokens_for_pcp_basic(tokens, num_reqs, num_computed_tokens,
                                      dtype=np.int32)
     input_batch.num_prompt_tokens = np.array(num_prompt_tokens, dtype=np.int32)
     arange_np = np.arange(10000)
+    num_scheduled_tokens = np.array(tokens)
+    pcp_manager.init_batch_info(num_scheduled_tokens, num_reqs)
     pcp_tokens_result, positions_result = pcp_manager.update_tokens_for_pcp(
-        np.array(tokens), arange_np, num_reqs, 1)
+        num_scheduled_tokens, arange_np)
     assert np.array_equal(pcp_tokens_result, expected_pcp_tokens), \
         f"Expected pcp_tokens: {expected_pcp_tokens}, got: {pcp_tokens_result}"
@@ -305,8 +307,8 @@ def test_generate_pcp_mtp_input(
     for i, token_ids_tensor in enumerate(token_ids_tensor_list):
         token_ids_cpu_tensor[i][:token_ids_tensor.size(0)] = token_ids_tensor
-    pcp_manager.generate_pcp_mtp_input(num_reqs, total_num_scheduled_tokens,
-                                       num_scheduled_tokens, False,
+    pcp_manager.init_batch_info(np.array(list(num_scheduled_tokens.values())), num_reqs)
+    pcp_manager.generate_pcp_mtp_input(total_num_scheduled_tokens, num_scheduled_tokens, False,
                                        input_batch, arange_np)
     assert torch.equal(
         pcp_manager.input_ids_pcp_full.cpu[:total_num_scheduled_tokens],
```
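
Both hunks follow the same shape: a single `init_batch_info` call computes the per-batch state up front, and `update_tokens_for_pcp` / `generate_pcp_mtp_input` drop `num_reqs` from their argument lists in favor of the cached batch info.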