[Disaggregated Prefill] P2P Disaggregated Prefill based on llm_datadist (#694)
### What this PR does / why we need it?

- This PR proposes a P2P version of Disaggregated Prefill based on llm_datadist, which manages the data transfer.
- This solution reworks the previous offline, single-node Disaggregated Prefill solution and now supports multi-node deployment and online serving.
- Currently this solution supports the 1P1D case of DeepSeek hybrid parallelism (P: TP+EP, D: DP+EP). Note that the xPyD case is accounted for in the solution design and will be supported soon within the v1 engine.

---------

Signed-off-by: hw_whx <wanghexiang7@huawei.com>
Signed-off-by: ganyi <pleaplusone.gy@gmail.com>
Co-authored-by: hw_whx <wanghexiang7@huawei.com>
Co-authored-by: ganyi <pleaplusone.gy@gmail.com>
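For context, here is a minimal sketch of how a 1P1D pair can be wired up, modeled on vLLM's upstream offline disaggregated-prefill example. The connector string `LLMDataDistConnector` and the model id are illustrative assumptions, not the exact identifiers this PR registers; the `kv_role`/`kv_rank`/`kv_parallel_size` fields follow vLLM's `KVTransferConfig`.

```python
# Hedged 1P1D sketch. Assumptions: the connector name "LLMDataDistConnector"
# and the model id are placeholders, not the identifiers added by this PR.
from multiprocessing import get_context

from vllm import LLM, SamplingParams
from vllm.config import KVTransferConfig


def run_instance(kv_role: str, kv_rank: int) -> None:
    # kv_parallel_size=2 because the transfer group holds exactly one
    # producer (prefill, P) and one consumer (decode, D).
    cfg = KVTransferConfig.from_cli(
        '{"kv_connector": "LLMDataDistConnector", '
        f'"kv_role": "{kv_role}", "kv_rank": {kv_rank}, '
        '"kv_parallel_size": 2}')
    llm = LLM(model="deepseek-ai/DeepSeek-V2-Lite",  # placeholder model
              kv_transfer_config=cfg)
    outputs = llm.generate(["Hello, my name is"],
                           SamplingParams(temperature=0, max_tokens=32))
    # The producer only runs prefill and ships KV blocks over the P2P link;
    # the consumer reuses those blocks and runs decode.
    if kv_role == "kv_consumer":
        print(outputs[0].outputs[0].text)


if __name__ == "__main__":
    ctx = get_context("spawn")
    prefill = ctx.Process(target=run_instance, args=("kv_producer", 0))
    decode = ctx.Process(target=run_instance, args=("kv_consumer", 1))
    prefill.start()
    decode.start()
    prefill.join()
    decode.join()
```

In a real deployment each instance would run on its own node; the online-serving path this PR describes wires the same producer/consumer roles into the API server rather than an offline script.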
```diff
@@ -123,7 +123,7 @@ def test_mtp_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
         "model_name": QUANT_MODEL,
 
         # GPU memory utilization
-        "gpu_memory_utilization": 0.85
+        "gpu_memory_utilization": 0.8
     }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -169,7 +169,7 @@ def test_mtp_e2e_quant_greedy_correctness(vllm_runner, common_llm_kwargs,
         "model_name": FLOAT_MODEL,
 
         # GPU memory utilization
-        "gpu_memory_utilization": 0.85
+        "gpu_memory_utilization": 0.8
     }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -230,7 +230,7 @@ def test_mtp_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
 
         # Main model
         "model_name": FLOAT_MODEL,
-        "gpu_memory_utilization": 0.85
+        "gpu_memory_utilization": 0.8
     }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -274,7 +274,7 @@ def test_mtp_e2e_greedy_correctness_torchair_graph(
 
         # Main model
         "model_name": QUANT_MODEL,
-        "gpu_memory_utilization": 0.85
+        "gpu_memory_utilization": 0.8
     }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -322,7 +322,7 @@ def test_mtp_e2e_quant_greedy_correctness_torchair_graph(
         "model_name": FLOAT_MODEL,
 
         # GPU memory utilization
-        "gpu_memory_utilization": 0.9
+        "gpu_memory_utilization": 0.8
     }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -369,7 +369,7 @@ def test_mtp_e2e_greedy_correctness_with_preemption(
         "model_name": FLOAT_MODEL,
 
         # GPU memory utilization
-        "gpu_memory_utilization": 0.9
+        "gpu_memory_utilization": 0.8
     }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -420,7 +420,7 @@ def test_mtp_different_k(vllm_runner, common_llm_kwargs,
         "model_name": FLOAT_MODEL,
 
         # GPU memory utilization
-        "gpu_memory_utilization": 0.9
+        "gpu_memory_utilization": 0.8
     }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
```
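All seven hunks make the same change: the MTP e2e tests now cap the engine at 80% of device memory instead of 85-90%. `gpu_memory_utilization` is the standard vLLM engine argument bounding the fraction of device memory reserved for weights, activations, and KV cache; lowering it leaves headroom for whatever else shares the device during CI. A rough sketch of how the stacked parametrize dicts reach the engine follows; the merge order and model id are assumptions, since the real tests go through the shared spec-decode e2e helpers.

```python
# Hedged sketch: how the stacked @pytest.mark.parametrize dicts above might
# collapse into one engine configuration. The merge order and the model id
# are assumptions; the real tests use shared spec-decode e2e utilities.
from vllm import LLM

common_llm_kwargs = {
    "model_name": "deepseek-ai/DeepSeek-V2-Lite",  # stands in for FLOAT_MODEL
    # After this PR: reserve at most 80% of device memory.
    "gpu_memory_utilization": 0.8,
}
per_test_common_llm_kwargs: dict = {}
baseline_llm_kwargs: dict = {}

# Later dicts override earlier ones, so a per-test entry can tighten the cap.
kwargs = {**common_llm_kwargs, **per_test_common_llm_kwargs,
          **baseline_llm_kwargs}
llm = LLM(model=kwargs.pop("model_name"), **kwargs)
```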