[V1][PP] Support pp with ray backend in V1 (#1800)
### What this PR does / why we need it?
Support pipeline parallelism with the Ray backend in the V1 Engine.
Fixes #1751
### Does this PR introduce _any_ user-facing change?
Users can specify `ray` as the distributed executor backend when running inference with pipeline parallelism.
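As a rough illustration (not part of this PR's diff), pipeline parallelism over the Ray backend can be requested through vLLM's offline `LLM` API; the model name and parallel sizes below are placeholders:

```python
# Hedged sketch: requesting pipeline parallelism over the Ray backend via
# vLLM's offline LLM API. Model name and sizes are illustrative placeholders.
from vllm import LLM, SamplingParams

llm = LLM(
    model="Qwen/Qwen2.5-7B-Instruct",    # placeholder model
    tensor_parallel_size=2,              # TP degree within each pipeline stage
    pipeline_parallel_size=2,            # PP degree across stages
    distributed_executor_backend="ray",  # the backend this PR enables for V1 PP
)
out = llm.generate(["Hello"], SamplingParams(max_tokens=8))
print(out[0].outputs[0].text)
```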
### How was this patch tested?
CI passed with the newly added test.
- vLLM version: v0.9.2
- vLLM main: 32142b3c62
---------
Signed-off-by: MengqingCao <cmq0113@163.com>
```diff
@@ -396,8 +396,10 @@ class AscendAttentionBackendImpl(AttentionImpl):
         if self.head_size == 192:
             cu_seqlen_q = [0] + attn_metadata.query_lens.tolist()
             cu_seqlen_k = [0] + attn_metadata.seq_lens.tolist()
-            cu_seqlen_q = torch.tensor(cu_seqlen_q, device="npu")
-            cu_seqlen_k = torch.tensor(cu_seqlen_k, device="npu")
+            cu_seqlen_q = torch.tensor(cu_seqlen_q,
+                                       device=query.device)
+            cu_seqlen_k = torch.tensor(cu_seqlen_k,
+                                       device=query.device)
             cu_seqlen_q = torch.cumsum(cu_seqlen_q, dim=0)
             cu_seqlen_k = torch.cumsum(cu_seqlen_k, dim=0)
             max_seqlen_q = torch.max(attn_metadata.query_lens)
```
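The hunk above replaces the hardcoded `device="npu"` with `device=query.device`, so the cumulative sequence-length tensors land on whatever device the query tensor already lives on. A minimal, standalone sketch of that device-agnostic pattern (the helper name is hypothetical):

```python
# Sketch of the device-agnostic pattern adopted above: derive the target
# device from an existing tensor instead of hardcoding a device string.
import torch

def build_cu_seqlens(lens: list[int], like: torch.Tensor) -> torch.Tensor:
    """Cumulative sequence lengths [0, l0, l0+l1, ...] on `like`'s device."""
    t = torch.tensor([0] + lens, device=like.device)
    return torch.cumsum(t, dim=0)

query = torch.zeros(6, 8)               # stands in for the real query tensor
print(build_cu_seqlens([2, 4], query))  # tensor([0, 2, 6])
```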
```diff
@@ -233,7 +233,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
             self.spec_attn_mask = torch.triu(torch.ones(2048,
                                                          2048,
                                                          dtype=torch.bool),
-                                             diagonal=1).to("npu")
+                                             diagonal=1).to(self.device)
             if get_pp_group().is_last_rank:
                 if self.speculative_config.method == "ngram":
                     self.drafter = NgramProposer(self.vllm_config)
```
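Here the speculative-decoding attention mask moves from a hardcoded `"npu"` to `self.device`, and the drafter setup shown in the surrounding context is gated on the last pipeline rank, the stage that owns the LM head and produces logits. A toy sketch of that rank-gating idea, with hypothetical names:

```python
# Toy sketch (hypothetical names): build stage-specific components only on
# the pipeline rank that actually needs them.
from dataclasses import dataclass

@dataclass
class PPGroup:
    rank: int
    world_size: int

    @property
    def is_first_rank(self) -> bool:
        return self.rank == 0

    @property
    def is_last_rank(self) -> bool:
        return self.rank == self.world_size - 1

pp = PPGroup(rank=1, world_size=2)
# Only the last stage produces logits, so only it needs a drafter.
drafter = "NgramProposer" if pp.is_last_rank else None
print(drafter)
```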
```diff
@@ -1120,6 +1120,19 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         input_ids = self.input_ids[:padded_batch_size]
         positions = self.positions[:padded_batch_size]
 
+        if get_pp_group().is_first_rank:
+            intermediate_tensors = None
+        else:
+            assert intermediate_tensors is not None
+            assert self.intermediate_tensors is not None
+            for k, v in intermediate_tensors.items():
+                self.intermediate_tensors[k][:num_input_tokens].copy_(
+                    v[:num_input_tokens], non_blocking=True)
+            intermediate_tensors = IntermediateTensors({
+                k: v[:num_input_tokens]
+                for k, v in self.intermediate_tensors.items()
+            })
+
         # Run forward pass
         with set_forward_context(attn_metadata,
                                  self.vllm_config,
```
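The added block gives non-first pipeline ranks their input hidden states: tensors received from the previous stage are copied into persistent, padded buffers, and the model is handed views sliced to the actual token count. A standalone sketch of that buffer-reuse pattern with made-up shapes:

```python
# Standalone sketch (made-up shapes) of the buffer-reuse pattern above:
# copy incoming per-stage tensors into persistent padded buffers, then
# pass the model a view sliced to the real token count.
import torch

persistent = {"hidden_states": torch.empty(16, 4)}  # padded, reused each step
incoming = {"hidden_states": torch.randn(16, 4)}    # from the previous PP stage
num_input_tokens = 5                                # actual tokens this step

for k, v in incoming.items():
    persistent[k][:num_input_tokens].copy_(v[:num_input_tokens],
                                           non_blocking=True)

intermediate_tensors = {k: v[:num_input_tokens] for k, v in persistent.items()}
print(intermediate_tensors["hidden_states"].shape)  # torch.Size([5, 4])
```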