From f5754d12567b37ffebbaa5885c9e6f4977bdc0e1 Mon Sep 17 00:00:00 2001
From: ykcombat <99869808+ykcombat@users.noreply.github.com>
Date: Sat, 11 Oct 2025 21:36:07 +0800
Subject: [PATCH] [Documentation][Configuration] Server args and documentation
 of PD-Multiplexing. (#11427)

---
 docs/advanced_features/pd_multiplexing.md | 56 +++++++++++++++++++++++
 docs/index.rst                            |  1 +
 python/sglang/srt/server_args.py          | 37 ++++++++++++++-
 3 files changed, 93 insertions(+), 1 deletion(-)
 create mode 100644 docs/advanced_features/pd_multiplexing.md

diff --git a/docs/advanced_features/pd_multiplexing.md b/docs/advanced_features/pd_multiplexing.md
new file mode 100644
index 000000000..9aecd70cd
--- /dev/null
+++ b/docs/advanced_features/pd_multiplexing.md
@@ -0,0 +1,56 @@
+
+# PD Multiplexing
+
+
+## Server Arguments
+
+| Argument | Type/Default | Description |
+|-----------------------------|-------------------------|----------------------------------------------------------|
+| `--enable-pdmux` | flag; default: disabled | Enable PD-Multiplexing (PD running on greenctx stream). |
+| `--pdmux-config-path` | string path; none | Path to the PD-Multiplexing YAML config file. |
+
+### YAML Configuration
+
+Example configuration for an H200 (132 SMs):
+
+```yaml
+# Number of SM groups to divide the GPU into.
+# Includes two default groups:
+# - Group 0: all SMs for prefill
+# - Last group: all SMs for decode
+# The number of manual divisions must be (sm_group_num - 2).
+sm_group_num: 8
+
+# Optional manual divisions of SMs.
+# Each entry contains:
+# - prefill_sm: number of SMs allocated for prefill
+# - decode_sm: number of SMs allocated for decode
+# - decode_bs_threshold: minimum decode batch size to select this group
+#
+# The sum of `prefill_sm` and `decode_sm` must equal the total number of SMs.
+# If provided, the number of entries must equal (sm_group_num - 2).
+manual_divisions:
+  - [112, 20, 1]
+  - [104, 28, 5]
+  - [96, 36, 10]
+  - [80, 52, 15]
+  - [64, 68, 20]
+  - [56, 76, 25]
+
+# Divisor for default stream index calculation.
+# Used when manual_divisions are not provided.
+# Formula:
+#   stream_idx = max(
+#       1,
+#       min(sm_group_num - 2,
+#           decode_bs * (sm_group_num - 2) // decode_bs_divisor
+#       )
+#   )
+decode_bs_divisor: 36
+
+# Maximum token budget for split_forward in the prefill stage.
+# Determines how many layers are executed per split_forward.
+# Formula:
+#   forward_count = max(1, split_forward_token_budget // extend_num_tokens)
+split_forward_token_budget: 65536
+```
diff --git a/docs/index.rst b/docs/index.rst
index 26dd96ee6..691bc8524 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -44,6 +44,7 @@ The core features include:
    advanced_features/quantization.md
    advanced_features/lora.ipynb
    advanced_features/pd_disaggregation.md
+   advanced_features/pd_multiplexing.md
    advanced_features/vlm_query.ipynb
    advanced_features/router.md
    advanced_features/observability.md
diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index abb850160..c7e80a2b9 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -471,7 +471,8 @@ class ServerArgs:
 
     # For PD-Multiplexing
     enable_pdmux: bool = False
-    sm_group_num: int = 3
+    pdmux_config_path: Optional[str] = None
+    sm_group_num: int = 8
 
 def get_attention_backends(server_args):
     prefill_attention_backend_str = (
@@ -2893,6 +2894,12 @@
             action="store_true",
             help="Enable PD-Multiplexing, PD running on greenctx stream.",
         )
+        parser.add_argument(
+            "--pdmux-config-path",
+            type=str,
+            default=None,
+            help="The path of the PD-Multiplexing config file.",
+        )
 
         parser.add_argument(
             "--sm-group-num",
@@ -3021,6 +3028,34 @@
             self.chunked_prefill_size % self.page_size == 0
         ), "chunked_prefill_size must be divisible by page_size"
 
+        # Check pdmux
+        if self.enable_pdmux:
+            assert (
+                self.pp_size == 1
+            ), "PD-Multiplexing is only supported with pipeline parallelism disabled (pp_size=1)."
+            assert (
+                self.chunked_prefill_size == -1
+            ), "PD-Multiplexing is not compatible with chunked prefill."
+            assert (
+                self.disaggregation_mode == "null"
+            ), "PD-Multiplexing is not compatible with disaggregation mode."
+            assert (
+                self.disable_overlap_schedule
+            ), "PD-Multiplexing is not compatible with overlap schedule."
+
+            # NOTE: CUDA Green Context may encounter potential issues with CudaGraph on torch 2.7.x – 2.8.x, leading to performance degradation.
+            import torch
+
+            parts = torch.__version__.split("+", 1)[0].split(".")
+            major = int(parts[0]) if len(parts) > 0 and parts[0].isdigit() else 0
+            minor = int(parts[1]) if len(parts) > 1 and parts[1].isdigit() else 0
+            if (major, minor) > (2, 6):
+                logger.warning(
+                    "WARNING: PD-Multiplexing may experience performance degradation with torch versions > 2.6.x.\n"
+                    f"    Current torch version is {torch.__version__}.\n"
+                    "    Please manually install torch 2.6.x."
+                )
+
         # Check multi tokenizer
         assert self.tokenizer_worker_num > 0, "Tokenizer worker num must >= 1"
         self.validate_buckets_rule(