Files
CaranLic 168ad600b5 [main] add pd transfer for ascend scheduler (#2753)
### What this PR does / why we need it?
For offline scenarios, adjust the scheduling process to prioritize the
prefill phase of all requests, then process the decode phase of all
requests.

### How was this patch tested?

```
max_num_seqs=24,
additional_config={
    "ascend_scheduler_config":{
        "enabled": True,
        "enable_pd_transfer": True,
        "decode_max_num_seqs": 24,
        "enable_chunked_prefill": False
    }
},
```
| input | output | num prompts | max_num_seqs | dp | tp | scheduler | tps |
| ------ | ------ | ---------- | ---------------- | ---- | ---- | ---------------- | --------------- |
| dapo-math-17K | 2K | 384 | 24 | 2 | 1 | v1 | 234.06 |
| dapo-math-17K | 2K | 384 | 24 | 2 | 1 | pd transfer | 239.59(+2.4%) |
| dapo-math-17K| 2K | 384 | 24 | 4 | 1 | v1 | 222.85 |
| dapo-math-17K| 2K | 384 | 24 | 4 | 1 | pd transfer | 225.81(+1.3%) |


- vLLM version: v0.10.1.1
- vLLM main:
6fb2788163

---------

Signed-off-by: CaranLic <740821011@qq.com>
2025-09-10 08:46:39 +08:00

41 lines
1.8 KiB
Python

import torch
from pytest_mock import MockerFixture
from vllm.config import SchedulerConfig, VllmConfig
from tests.ut.base import PytestBase
from vllm_ascend.sample.logits_processor import AscendMinPLogitsProcessor
class TestMinPLogitsProcessorInitFunc(PytestBase):
    """Construction checks for ``AscendMinPLogitsProcessor``."""

    def test_init_func_with_decode_max_num_seqs(self, mocker: MockerFixture):
        """Build the processor for a CPU and an NPU device and check its state.

        A mocked ``VllmConfig`` carries a mocked ``SchedulerConfig`` with
        ``decode_max_num_seqs = 0`` and ``max_num_seqs = 128``. ``torch.zeros``
        and ``torch.empty`` are patched to hand back pre-built 256-element
        float32 tensors. For each device the test then asserts that ``min_p``
        is set, that ``use_double_tensor`` is ``False`` on CPU and ``True`` on
        NPU, and that ``min_p_cpu`` has 256 entries.

        Args:
            mocker: pytest-mock fixture used to create the mocks and patches.
        """
        # NOTE(review): the test name says "with_decode_max_num_seqs" but the
        # value configured below is 0 -- confirm this is the intended case.
        # NOTE(review): the expected length (256) equals the size of the
        # patched tensors, not max_num_seqs (128); presumably the assertion
        # reflects the stubbed allocation -- confirm against the processor.
        buffer_len = 256
        pin_memory = False

        # Devices are created up front, before any torch function is patched.
        cpu_device = torch.device("cpu")
        npu_device = torch.device("npu")

        scheduler_cfg = mocker.MagicMock(spec=SchedulerConfig)
        scheduler_cfg.decode_max_num_seqs = 0
        scheduler_cfg.max_num_seqs = 128

        vllm_cfg = mocker.MagicMock(spec=VllmConfig)
        vllm_cfg.scheduler_config = scheduler_cfg

        # torch.zeros / torch.empty reportedly error out on the online UT
        # machine, so both are replaced with stubs returning pre-built tensors.
        zeros_stub = torch.zeros((buffer_len, ),
                                 dtype=torch.float32,
                                 pin_memory=False)
        empty_stub = torch.empty((buffer_len, ), dtype=torch.float32)
        mocker.patch("torch.zeros", return_value=zeros_stub)
        mocker.patch("torch.empty", return_value=empty_stub)

        # (device, expected value of use_double_tensor)
        cases = (
            (cpu_device, False),
            (npu_device, True),
        )
        for device, expects_double in cases:
            processor = AscendMinPLogitsProcessor(vllm_cfg, device,
                                                  pin_memory)
            assert processor.min_p is not None
            assert processor.use_double_tensor is expects_double
            assert processor.min_p_cpu.shape[0] == buffer_len