[Documentation][Configuration] Server args and documentation of PD-Multiplexing. (#11427)

This commit is contained in:
ykcombat
2025-10-11 21:36:07 +08:00
committed by GitHub
parent 739daa63e4
commit f5754d1256
3 changed files with 93 additions and 1 deletion

View File

@@ -471,7 +471,8 @@ class ServerArgs:
# For PD-Multiplexing
enable_pdmux: bool = False
sm_group_num: int = 3
pdmux_config_path: Optional[str] = None
sm_group_num: int = 8
def get_attention_backends(server_args):
prefill_attention_backend_str = (
@@ -2893,6 +2894,12 @@ class ServerArgs:
action="store_true",
help="Enable PD-Multiplexing, PD running on greenctx stream.",
)
parser.add_argument(
"--pdmux-config-path",
type=str,
default=None,
help="The path of the PD-Multiplexing config file.",
)
parser.add_argument(
"--sm-group-num",
@@ -3021,6 +3028,34 @@ class ServerArgs:
self.chunked_prefill_size % self.page_size == 0
), "chunked_prefill_size must be divisible by page_size"
# Check pdmux
if self.enable_pdmux:
assert (
self.pp_size == 1
), "PD-Multiplexing is only supported with pipeline parallelism disabled (pp_size=1)."
assert (
self.chunked_prefill_size == -1
), "PD-Multiplexing is not compatible with chunked prefill."
assert (
self.disaggregation_mode == "null"
), "PD-Multiplexing is not compatible with disaggregation mode."
assert (
self.disable_overlap_schedule
), "PD-Multiplexing is not compatible with overlap schedule."
# NOTE: CUDA Green Context may encounter potential issues with CudaGraph on torch 2.7.x 2.8.x, leading to performance degradation.
import torch
parts = torch.__version__.split("+", 1)[0].split(".")
major = int(parts[0]) if len(parts) > 0 and parts[0].isdigit() else 0
minor = int(parts[1]) if len(parts) > 1 and parts[1].isdigit() else 0
if (major, minor) > (2, 6):
logger.warning(
"WARNING: PD-Multiplexing may experience performance degradation with torch versions > 2.6.x.\n"
f" Current torch version is {torch.__version__}.\n"
" Please manually install torch 2.6.x."
)
# Check multi tokenizer
assert self.tokenizer_worker_num > 0, "Tokenizer worker num must >= 1"
self.validate_buckets_rule(