[main] add pd transfer for ascend scheduler (#2753)

### What this PR does / why we need it?

For offline scenarios, adjust the scheduling process so that the prefill phase of all requests is prioritized first, and the decode phase of all requests is processed afterwards.

### How was this patch tested?

```
max_num_seqs=24,
additional_config={
    "ascend_scheduler_config": {
        "enabled": True,
        "enable_pd_transfer": True,
        "decode_max_num_seqs": 24,
        "enable_chunked_prefill": False
    }
},
```

| input | output | num prompts | max_num_seqs | dp | tp | scheduler | tps |
| ------ | ------ | ---------- | ---------------- | ---- | ---- | ---------------- | --------------- |
| dapo-math-17K | 2K | 384 | 24 | 2 | 1 | v1 | 234.06 |
| dapo-math-17K | 2K | 384 | 24 | 2 | 1 | pd transfer | 239.59(+2.4%) |
| dapo-math-17K | 2K | 384 | 24 | 4 | 1 | v1 | 222.85 |
| dapo-math-17K | 2K | 384 | 24 | 4 | 1 | pd transfer | 225.81(+1.3%) |

- vLLM version: v0.10.1.1
- vLLM main: 6fb2788163

---------

Signed-off-by: CaranLic <740821011@qq.com>
2025-09-10 08:46:39 +08:00
parent edf1f600ad
commit 168ad600b5
9 changed files with 216 additions and 4 deletions
--- a/tests/ut/core/test_schedule_config.py
+++ b/tests/ut/core/test_schedule_config.py
@@ -165,3 +165,16 @@ class TestAscendSchedulerConfig(TestBase):
        )
        self.assertIn("max_num_batched_tokens (2048)", str(context.exception))
        self.assertIn("max_model_len (4096)", str(context.exception))
+
+    def test_initialize_from_config_with_pd_transfer(self):
+        ascend_config = AscendSchedulerConfig.initialize_from_config(
+            self.basic_scheduler_config,
+            AscendSchedulerConfig(
+                enable_pd_transfer=True,
+                decode_max_num_seqs=48,
+                max_num_batched_tokens=4096,
+                max_model_len=4096,
+            ),
+        )
+        self.assertEqual(ascend_config.enable_pd_transfer, True)
+        self.assertEqual(ascend_config.decode_max_num_seqs, 48)