[V1][PP] Support pp with ray backend in V1 (#1800)

### What this PR does / why we need it?
Support pipeline parallelism with the Ray backend in the V1 engine.

Fixes #1751

### Does this PR introduce _any_ user-facing change?
Users can specify `ray` as the distributed backend when running inference with pipeline parallelism; a minimal sketch is shown below.
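
For illustration, a minimal sketch of such a launch using vLLM's offline `LLM` API (the model name and parallel sizes are assumptions for the example, not taken from this PR):

```python
# Minimal sketch: run pipeline parallelism on the Ray distributed backend.
# Model name and parallel sizes below are illustrative assumptions.
from vllm import LLM, SamplingParams

llm = LLM(
    model="Qwen/Qwen2.5-7B-Instruct",
    tensor_parallel_size=2,
    pipeline_parallel_size=2,            # pp > 1 needs a multi-worker executor
    distributed_executor_backend="ray",  # select Ray instead of multiprocessing
)
outputs = llm.generate(["Hello, world!"], SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)
```

The same selection works on the serving path, e.g. `vllm serve <model> --pipeline-parallel-size 2 --distributed-executor-backend ray`.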

### How was this patch tested?
CI passed with the newly added test.
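
As an illustration of the shape such a check can take (this is a hypothetical sketch, not the actual test added in this PR; the model name, sizes, and test name are assumptions):

```python
# Hypothetical e2e sketch: pp=2 inference on the Ray backend should produce output.
import pytest
from vllm import LLM, SamplingParams

@pytest.mark.parametrize("distributed_executor_backend", ["ray"])
def test_generate_with_pp_on_ray(distributed_executor_backend):
    llm = LLM(
        model="Qwen/Qwen2.5-0.5B-Instruct",  # small model to keep CI cheap
        pipeline_parallel_size=2,
        distributed_executor_backend=distributed_executor_backend,
    )
    outputs = llm.generate(["Hello, world"], SamplingParams(max_tokens=8))
    assert outputs and outputs[0].outputs[0].text
```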


- vLLM version: v0.9.2
- vLLM main: 32142b3c62

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
Author: Mengqing Cao
Date: 2025-07-23 14:52:52 +08:00
Committed by: GitHub
Parent: 9a3bdf2162
Commit: 3aa3b46bfe
5 changed files with 32 additions and 18 deletions


@@ -400,19 +400,13 @@ class TestAscendAttentionBackendImpl(TestBase):
         layer = self.layer_no_quant
         mock_vanilla_prefill.return_value = MagicMock()
 
-        def mock_tensor(data, device=None, **kwargs):
-            if device == "npu":
-                return metadata.attn_mask
-            return torch.tensor(data, **kwargs)
-
-        with patch("torch.tensor", side_effect=mock_tensor):
-            output = self.impl_192.forward(layer,
-                                           query,
-                                           key,
-                                           value,
-                                           kv_cache,
-                                           metadata,
-                                           trace_flag=False)
+        output = self.impl_192.forward(layer,
+                                       query,
+                                       key,
+                                       value,
+                                       kv_cache,
+                                       metadata,
+                                       trace_flag=False)
 
         mock_vanilla_prefill.assert_called_once()
         assert output.shape == (10, 8 * 192)