From f5754d12567b37ffebbaa5885c9e6f4977bdc0e1 Mon Sep 17 00:00:00 2001
From: ykcombat <99869808+ykcombat@users.noreply.github.com>
Date: Sat, 11 Oct 2025 21:36:07 +0800
Subject: [PATCH] [Documentation][Configuration] Server args and documentation
 of PD-Multiplexing. (#11427)

---
 docs/advanced_features/pd_multiplexing.md | 56 +++++++++++++++++++++++
 docs/index.rst                            |  1 +
 python/sglang/srt/server_args.py          | 37 ++++++++++++++-
 3 files changed, 93 insertions(+), 1 deletion(-)
 create mode 100644 docs/advanced_features/pd_multiplexing.md

diff --git a/docs/advanced_features/pd_multiplexing.md b/docs/advanced_features/pd_multiplexing.md
new file mode 100644
index 000000000..9aecd70cd
--- /dev/null
+++ b/docs/advanced_features/pd_multiplexing.md
@@ -0,0 +1,56 @@
+
+# PD Multiplexing
+
+
+## Server Arguments
+
+| Argument | Type/Default | Description |
+|-----------------------------|-------------------------|----------------------------------------------------------|
+| `--enable-pdmux` | flag; default: disabled | Enable PD-Multiplexing (PD running on greenctx stream). |
+| `--pdmux-config-path` | string path; none | Path to the PD-Multiplexing YAML config file. |
+
+### YAML Configuration
+
+Example configuration for an H200 (132 SMs):
+
+```yaml
+# Number of SM groups to divide the GPU into.
+# Includes two default groups:
+# - Group 0: all SMs for prefill
+# - Last group: all SMs for decode
+# The number of manual divisions must be (sm_group_num - 2).
+sm_group_num: 8
+
+# Optional manual divisions of SMs.
+# Each entry contains:
+# - prefill_sm: number of SMs allocated for prefill
+# - decode_sm: number of SMs allocated for decode
+# - decode_bs_threshold: minimum decode batch size to select this group
+#
+# The sum of `prefill_sm` and `decode_sm` must equal the total number of SMs.
+# If provided, the number of entries must equal (sm_group_num - 2).
+manual_divisions:
+  - [112, 20, 1]
+  - [104, 28, 5]
+  - [96, 36, 10]
+  - [80, 52, 15]
+  - [64, 68, 20]
+  - [56, 76, 25]
+
+# Divisor for default stream index calculation.
+# Used when manual_divisions are not provided.
+# Formula:
+#   stream_idx = max(
+#       1,
+#       min(sm_group_num - 2,
+#           decode_bs * (sm_group_num - 2) // decode_bs_divisor
+#       )
+#   )
+decode_bs_divisor: 36
+
+# Maximum token budget for split_forward in the prefill stage.
+# Determines how many layers are executed per split_forward.
+# Formula:
+#   forward_count = max(1, split_forward_token_budget // extend_num_tokens)
+split_forward_token_budget: 65536
+```
diff --git a/docs/index.rst b/docs/index.rst
index 26dd96ee6..691bc8524 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -44,6 +44,7 @@ The core features include:
    advanced_features/quantization.md
    advanced_features/lora.ipynb
    advanced_features/pd_disaggregation.md
+   advanced_features/pd_multiplexing.md
    advanced_features/vlm_query.ipynb
    advanced_features/router.md
    advanced_features/observability.md
diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index abb850160..c7e80a2b9 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -471,7 +471,8 @@ class ServerArgs:
 
     # For PD-Multiplexing
     enable_pdmux: bool = False
-    sm_group_num: int = 3
+    pdmux_config_path: Optional[str] = None
+    sm_group_num: int = 8
 
 def get_attention_backends(server_args):
     prefill_attention_backend_str = (
@@ -2893,6 +2894,12 @@
             action="store_true",
             help="Enable PD-Multiplexing, PD running on greenctx stream.",
         )
+        parser.add_argument(
+            "--pdmux-config-path",
+            type=str,
+            default=None,
+            help="The path of the PD-Multiplexing config file.",
+        )
 
         parser.add_argument(
             "--sm-group-num",
@@ -3021,6 +3028,34 @@
             self.chunked_prefill_size % self.page_size == 0
         ), "chunked_prefill_size must be divisible by page_size"
 
+        # Check pdmux
+        if self.enable_pdmux:
+            assert (
+                self.pp_size == 1
+            ), "PD-Multiplexing is only supported with pipeline parallelism disabled (pp_size=1)."
+            assert (
+                self.chunked_prefill_size == -1
+            ), "PD-Multiplexing is not compatible with chunked prefill."
+            assert (
+                self.disaggregation_mode == "null"
+            ), "PD-Multiplexing is not compatible with disaggregation mode."
+            assert (
+                self.disable_overlap_schedule
+            ), "PD-Multiplexing is not compatible with overlap schedule."
+
+            # NOTE: CUDA Green Context may encounter potential issues with CudaGraph on torch 2.7.x – 2.8.x, leading to performance degradation.
+            import torch
+
+            parts = torch.__version__.split("+", 1)[0].split(".")
+            major = int(parts[0]) if len(parts) > 0 and parts[0].isdigit() else 0
+            minor = int(parts[1]) if len(parts) > 1 and parts[1].isdigit() else 0
+            if (major, minor) > (2, 6):
+                logger.warning(
+                    "WARNING: PD-Multiplexing may experience performance degradation with torch versions > 2.6.x.\n"
+                    f"    Current torch version is {torch.__version__}.\n"
+                    "    Please manually install torch 2.6.x."
+                )
+
         # Check multi tokenizer
         assert self.tokenizer_worker_num > 0, "Tokenizer worker num must >= 1"
         self.validate_buckets_rule(