[PD] Support PD disaggregation with Prefill PP (#8846)
Signed-off-by: Shangming Cai <caishangming@linux.alibaba.com> Signed-off-by: Shangming Cai <csmthu@gmail.com> Co-authored-by: root <huzhiyuan@xiaohongshu.com> Co-authored-by: Ying Sheng <sqy1415@gmail.com> Co-authored-by: Francis <38564764+ssssnow@users.noreply.github.com> Co-authored-by: zitto <zhjc1124@gmail.com>
This commit is contained in:
@@ -307,8 +307,13 @@ class ModelRunner:
|
||||
self.start_layer = getattr(self.model, "start_layer", 0)
|
||||
self.end_layer = getattr(self.model, "end_layer", model_num_layers)
|
||||
self.num_effective_layers = self.end_layer - self.start_layer
|
||||
assert (not model_has_mtp_layers) or (
|
||||
self.num_effective_layers == model_num_layers
|
||||
assert (
|
||||
(not model_has_mtp_layers)
|
||||
or (self.spec_algorithm.is_none())
|
||||
or (
|
||||
(not self.spec_algorithm.is_none())
|
||||
and (self.num_effective_layers == model_num_layers)
|
||||
)
|
||||
), "PP is not compatible with MTP models."
|
||||
|
||||
# Apply torchao quantization
|
||||
@@ -1048,8 +1053,6 @@ class ModelRunner:
|
||||
else:
|
||||
num_layers = self.num_effective_layers
|
||||
if self.use_mla_backend:
|
||||
# FIXME: pipeline parallelism is not compatible with mla backend
|
||||
assert self.pp_size == 1
|
||||
cell_size = (
|
||||
(self.model_config.kv_lora_rank + self.model_config.qk_rope_head_dim)
|
||||
* num_layers
|
||||
|
||||
Reference in New Issue
Block a user