[Documentation][Configuration] Server args and documentation of PD-Multiplexing. (#11427)

2025-10-11 21:36:07 +08:00
parent 739daa63e4
commit f5754d1256
3 changed files with 93 additions and 1 deletions
--- a/docs/advanced_features/pd_multiplexing.md
+++ b/docs/advanced_features/pd_multiplexing.md
@@ -0,0 +1,56 @@
 # PD Multiplexing
 ## Server Arguments
 | Argument                     | Type/Default            | Description                                              |
 |-----------------------------|-------------------------|----------------------------------------------------------|
 | `--enable-pdmux`            | flag; default: disabled | Enable PD-Multiplexing (prefill and decode run on CUDA Green Context streams). |
 | `--pdmux-config-path <path>`| string; default: none   | Path to the PD-Multiplexing YAML config file.            |
 ### YAML Configuration
 Example configuration for an H200 (132 SMs):
 ```yaml
 # Number of SM groups to divide the GPU into.
 # Includes two default groups:
 #   - Group 0: all SMs for prefill
 #   - Last group: all SMs for decode
 # The number of manual divisions must be (sm_group_num - 2).
 sm_group_num: 8
 # Optional manual divisions of SMs.
 # Each entry contains:
 #   - prefill_sm: number of SMs allocated for prefill
 #   - decode_sm: number of SMs allocated for decode
 #   - decode_bs_threshold: minimum decode batch size to select this group
 #
 # The sum of `prefill_sm` and `decode_sm` must equal the total number of SMs on the GPU (132 in this example).
 # If provided, the number of entries must equal (sm_group_num - 2).
 manual_divisions:
  - [112, 20, 1]
  - [104, 28, 5]
  - [96, 36, 10]
  - [80, 52, 15]
  - [64, 68, 20]
  - [56, 76, 25]
 # Divisor for default stream index calculation.
 # Used only when `manual_divisions` is not provided.
 # Formula:
 #   stream_idx = max(
 #       1,
 #       min(sm_group_num - 2,
 #           decode_bs * (sm_group_num - 2) // decode_bs_divisor
 #       )
 #   )
 decode_bs_divisor: 36
 # Maximum token budget for split_forward in the prefill stage.
 # Determines how many layers are executed per split_forward.
 # Formula:
 #   forward_count = max(1, split_forward_token_budget // extend_num_tokens)
 split_forward_token_budget: 65536
 ```
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -44,6 +44,7 @@ The core features include:
   advanced_features/quantization.md
   advanced_features/lora.ipynb
   advanced_features/pd_disaggregation.md
   advanced_features/pd_multiplexing.md
   advanced_features/vlm_query.ipynb
   advanced_features/router.md
   advanced_features/observability.md
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -471,7 +471,8 @@ class ServerArgs:
    # For PD-Multiplexing
    enable_pdmux: bool = False
-    sm_group_num: int = 3
+    pdmux_config_path: Optional[str] = None
    sm_group_num: int = 8
    def get_attention_backends(server_args):
        prefill_attention_backend_str = (
@@ -2893,6 +2894,12 @@ class ServerArgs:
            action="store_true",
            help="Enable PD-Multiplexing, PD running on greenctx stream.",
        )
        parser.add_argument(
            "--pdmux-config-path",
            type=str,
            default=None,
            help="The path of the PD-Multiplexing config file.",
        )
        parser.add_argument(
            "--sm-group-num",
@@ -3021,6 +3028,34 @@ class ServerArgs:
                self.chunked_prefill_size % self.page_size == 0
            ), "chunked_prefill_size must be divisible by page_size"
        # Check pdmux
        if self.enable_pdmux:
            assert (
                self.pp_size == 1
            ), "PD-Multiplexing is only supported with pipeline parallelism disabled (pp_size=1)."
            assert (
                self.chunked_prefill_size == -1
            ), "PD-Multiplexing is not compatible with chunked prefill."
            assert (
                self.disaggregation_mode == "null"
            ), "PD-Multiplexing is not compatible with disaggregation mode."
            assert (
                self.disable_overlap_schedule
            ), "PD-Multiplexing is not compatible with overlap schedule."
            # NOTE: CUDA Green Context may encounter potential issues with CudaGraph on torch 2.7.x – 2.8.x, leading to performance degradation.
            import torch
            parts = torch.__version__.split("+", 1)[0].split(".")
            major = int(parts[0]) if len(parts) > 0 and parts[0].isdigit() else 0
            minor = int(parts[1]) if len(parts) > 1 and parts[1].isdigit() else 0
            if (major, minor) > (2, 6):
                logger.warning(
                    "WARNING: PD-Multiplexing may experience performance degradation with torch versions > 2.6.x.\n"
                    f"  Current torch version is {torch.__version__}.\n"
                    "  Please manually install torch 2.6.x."
                )
        # Check multi tokenizer
        assert self.tokenizer_worker_num > 0, "Tokenizer worker num must >= 1"
        self.validate_buckets_rule(