[V1][PP] Support pp with ray backend in V1 (#1800)

### What this PR does / why we need it?
Support pipeline parallelism (PP) with the Ray backend in V1Engine. Fixes #1751

### Does this PR introduce _any_ user-facing change?
Users can now specify `ray` as the distributed executor backend when running inference with pipeline parallelism.

### How was this patch tested?
CI passed with the newly added test.

- vLLM version: v0.9.2
- vLLM main: 32142b3c62

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-07-23 14:52:52 +08:00
parent 9a3bdf2162
commit 3aa3b46bfe
5 changed files with 32 additions and 18 deletions
--- a/tests/e2e/multicard/test_pipeline_parallel.py
+++ b/tests/e2e/multicard/test_pipeline_parallel.py
@@ -24,6 +24,7 @@ MODELS = [

 TENSOR_PARALLELS = [2]
 PIPELINE_PARALLELS = [2]
+DIST_EXECUTOR_BACKEND = ["mp", "ray"]

 prompts = [
    "Hello, my name is",
@@ -34,10 +35,13 @@ prompts = [
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
@pytest.mark.parametrize("pp_size", PIPELINE_PARALLELS)
-def test_models(model: str, tp_size: int, pp_size: int) -> None:
+@pytest.mark.parametrize("distributed_executor_backend", DIST_EXECUTOR_BACKEND)
+def test_models(model: str, tp_size: int, pp_size: int,
+                distributed_executor_backend: str) -> None:
    with VllmRunner(model,
                    tensor_parallel_size=tp_size,
                    pipeline_parallel_size=pp_size,
+                    distributed_executor_backend=distributed_executor_backend,
                    enforce_eager=True,
                    gpu_memory_utilization=0.7) as vllm_model:
        vllm_model.generate_greedy(prompts, 64)