### What this PR does / why we need it?
For offline scenarios, adjust the scheduling process to prioritize the
prefill phase of all requests, then process the decode phase of all
requests.
### How was this patch tested?
```
max_num_seqs=24,
additional_config={
"ascend_scheduler_config":{
"enabled": True,
"enable_pd_transfer": True,
"decode_max_num_seqs": 24,
"enable_chunked_prefill": False
}
},
```
| input | output | num prompts | max_num_seqs | dp | tp | scheduler | tps |
| ------------- | ------ | ----------- | ------------ | -- | -- | ----------- | -------------- |
| dapo-math-17K | 2K | 384 | 24 | 2 | 1 | v1 | 234.06 |
| dapo-math-17K | 2K | 384 | 24 | 2 | 1 | pd transfer | 239.59 (+2.4%) |
| dapo-math-17K | 2K | 384 | 24 | 4 | 1 | v1 | 222.85 |
| dapo-math-17K | 2K | 384 | 24 | 4 | 1 | pd transfer | 225.81 (+1.3%) |
- vLLM version: v0.10.1.1
- vLLM main:
6fb2788163
---------
Signed-off-by: CaranLic <740821011@qq.com>
36 lines · 1.5 KiB · Python
import torch
|
|
from vllm.config import VllmConfig
|
|
from vllm.v1.sample.logits_processor import MinPLogitsProcessor
|
|
|
|
|
|
class AscendMinPLogitsProcessor(MinPLogitsProcessor):
    """Min-p logits processor for the Ascend backend.

    When the scheduler config carries a non-zero ``decode_max_num_seqs``
    attribute (set by the PD-transfer ascend scheduler — presumably the
    decode-phase batch cap; confirm against the scheduler), the buffers
    allocated by the base class are re-created here, sized for the larger
    of ``max_num_seqs`` and ``decode_max_num_seqs``.
    """

    def __init__(self, vllm_config: "VllmConfig", device: torch.device,
                 is_pin_memory: bool):
        # Base class sets up the default-sized min-p state first.
        super().__init__(vllm_config, device, is_pin_memory)

        scheduler_config = vllm_config.scheduler_config
        decode_max_num_seqs = getattr(scheduler_config, 'decode_max_num_seqs',
                                      0)
        if decode_max_num_seqs == 0:
            # No PD-transfer override: keep the base-class buffers as-is.
            return

        # Buffers must hold whichever phase admits more requests.
        max_num_reqs = max(scheduler_config.max_num_seqs, decode_max_num_seqs)

        self.min_p_count: int = 0

        # Host-side staging tensor; shares memory with its numpy view below.
        self.min_p_cpu_tensor = torch.zeros((max_num_reqs, ),
                                            dtype=torch.float32,
                                            device="cpu",
                                            pin_memory=is_pin_memory)
        self.min_p_cpu = self.min_p_cpu_tensor.numpy()

        # A separate device copy is only needed when sampling runs off-CPU.
        self.use_double_tensor = torch.device(device).type != "cpu"
        if self.use_double_tensor:
            # Pre-allocated device tensor
            self.min_p_device: torch.Tensor = torch.empty((max_num_reqs, ),
                                                          dtype=torch.float32,
                                                          device=device)
        else:
            self.min_p_device = self.min_p_cpu_tensor
        # Current slice of the device tensor
        self.min_p: torch.Tensor = self.min_p_device[:0]