[PD] Support PD disaggregation with Prefill PP (#8846)

Signed-off-by: Shangming Cai <caishangming@linux.alibaba.com>
Signed-off-by: Shangming Cai <csmthu@gmail.com>
Co-authored-by: root <huzhiyuan@xiaohongshu.com>
Co-authored-by: Ying Sheng <sqy1415@gmail.com>
Co-authored-by: Francis <38564764+ssssnow@users.noreply.github.com>
Co-authored-by: zitto <zhjc1124@gmail.com>
This commit is contained in:
Shangming Cai
2025-08-17 09:31:31 +08:00
committed by GitHub
parent 6a9d6ca33c
commit 384f8ab5ce
11 changed files with 632 additions and 82 deletions

View File

@@ -307,8 +307,13 @@ class ModelRunner:
self.start_layer = getattr(self.model, "start_layer", 0)
self.end_layer = getattr(self.model, "end_layer", model_num_layers)
self.num_effective_layers = self.end_layer - self.start_layer
- assert (not model_has_mtp_layers) or (
- self.num_effective_layers == model_num_layers
+ assert (
+ (not model_has_mtp_layers)
+ or (self.spec_algorithm.is_none())
+ or (
+ (not self.spec_algorithm.is_none())
+ and (self.num_effective_layers == model_num_layers)
+ )
), "PP is not compatible with MTP models."
# Apply torchao quantization
@@ -1048,8 +1053,6 @@ class ModelRunner:
else:
num_layers = self.num_effective_layers
if self.use_mla_backend:
- # FIXME: pipeline parallelism is not compatible with mla backend
- assert self.pp_size == 1
cell_size = (
(self.model_config.kv_lora_rank + self.model_config.qk_rope_head_dim)
* num_layers