[Documentation][Configuration] Server args and documentation of PD-Multiplexing. (#11427)
This commit is contained in:
56
docs/advanced_features/pd_multiplexing.md
Normal file
56
docs/advanced_features/pd_multiplexing.md
Normal file
@@ -0,0 +1,56 @@
|
|||||||
|
|
||||||
|
# PD Multiplexing
|
||||||
|
|
||||||
|
|
||||||
|
## Server Arguments
|
||||||
|
|
||||||
|
| Argument | Type/Default | Description |
|
||||||
|
|-----------------------------|-------------------------|----------------------------------------------------------|
|
||||||
|
| `--enable-pdmux`            | flag; default: disabled | Enable PD-Multiplexing (prefill and decode run on CUDA Green Context streams). |
|
||||||
|
| `--pdmux-config-path <path>` | string; default: none   | Path to the PD-Multiplexing YAML config file.            |
|
||||||
|
|
||||||
|
### YAML Configuration
|
||||||
|
|
||||||
|
Example configuration for an H200 (132 SMs):
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# Number of SM groups to divide the GPU into.
|
||||||
|
# Includes two default groups:
|
||||||
|
# - Group 0: all SMs for prefill
|
||||||
|
# - Last group: all SMs for decode
|
||||||
|
# The number of manual divisions must be (sm_group_num - 2).
|
||||||
|
sm_group_num: 8
|
||||||
|
|
||||||
|
# Optional manual divisions of SMs.
|
||||||
|
# Each entry contains:
|
||||||
|
# - prefill_sm: number of SMs allocated for prefill
|
||||||
|
# - decode_sm: number of SMs allocated for decode
|
||||||
|
# - decode_bs_threshold: minimum decode batch size to select this group
|
||||||
|
#
|
||||||
|
# The sum of `prefill_sm` and `decode_sm` must equal the total number of SMs.
|
||||||
|
# If provided, the number of entries must equal (sm_group_num - 2).
|
||||||
|
manual_divisions:
|
||||||
|
- [112, 20, 1]
|
||||||
|
- [104, 28, 5]
|
||||||
|
- [96, 36, 10]
|
||||||
|
- [80, 52, 15]
|
||||||
|
- [64, 68, 20]
|
||||||
|
- [56, 76, 25]
|
||||||
|
|
||||||
|
# Divisor for default stream index calculation.
|
||||||
|
# Used when manual_divisions are not provided.
|
||||||
|
# Formula:
|
||||||
|
# stream_idx = max(
|
||||||
|
# 1,
|
||||||
|
# min(sm_group_num - 2,
|
||||||
|
# decode_bs * (sm_group_num - 2) // decode_bs_divisor
|
||||||
|
# )
|
||||||
|
# )
|
||||||
|
decode_bs_divisor: 36
|
||||||
|
|
||||||
|
# Maximum token budget for split_forward in the prefill stage.
|
||||||
|
# Determines how many layers are executed per split_forward.
|
||||||
|
# Formula:
|
||||||
|
# forward_count = max(1, split_forward_token_budget // extend_num_tokens)
|
||||||
|
split_forward_token_budget: 65536
|
||||||
|
```
|
||||||
@@ -44,6 +44,7 @@ The core features include:
|
|||||||
advanced_features/quantization.md
|
advanced_features/quantization.md
|
||||||
advanced_features/lora.ipynb
|
advanced_features/lora.ipynb
|
||||||
advanced_features/pd_disaggregation.md
|
advanced_features/pd_disaggregation.md
|
||||||
|
advanced_features/pd_multiplexing.md
|
||||||
advanced_features/vlm_query.ipynb
|
advanced_features/vlm_query.ipynb
|
||||||
advanced_features/router.md
|
advanced_features/router.md
|
||||||
advanced_features/observability.md
|
advanced_features/observability.md
|
||||||
|
|||||||
@@ -471,7 +471,8 @@ class ServerArgs:
|
|||||||
|
|
||||||
# For PD-Multiplexing
|
# For PD-Multiplexing
|
||||||
enable_pdmux: bool = False
|
enable_pdmux: bool = False
|
||||||
sm_group_num: int = 3
|
pdmux_config_path: Optional[str] = None
|
||||||
|
sm_group_num: int = 8
|
||||||
|
|
||||||
def get_attention_backends(server_args):
|
def get_attention_backends(server_args):
|
||||||
prefill_attention_backend_str = (
|
prefill_attention_backend_str = (
|
||||||
@@ -2893,6 +2894,12 @@ class ServerArgs:
|
|||||||
action="store_true",
|
action="store_true",
|
||||||
help="Enable PD-Multiplexing, PD running on greenctx stream.",
|
help="Enable PD-Multiplexing, PD running on greenctx stream.",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--pdmux-config-path",
|
||||||
|
type=str,
|
||||||
|
default=None,
|
||||||
|
help="The path of the PD-Multiplexing config file.",
|
||||||
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--sm-group-num",
|
"--sm-group-num",
|
||||||
@@ -3021,6 +3028,34 @@ class ServerArgs:
|
|||||||
self.chunked_prefill_size % self.page_size == 0
|
self.chunked_prefill_size % self.page_size == 0
|
||||||
), "chunked_prefill_size must be divisible by page_size"
|
), "chunked_prefill_size must be divisible by page_size"
|
||||||
|
|
||||||
|
# Check pdmux
|
||||||
|
if self.enable_pdmux:
|
||||||
|
assert (
|
||||||
|
self.pp_size == 1
|
||||||
|
), "PD-Multiplexing is only supported with pipeline parallelism disabled (pp_size=1)."
|
||||||
|
assert (
|
||||||
|
self.chunked_prefill_size == -1
|
||||||
|
), "PD-Multiplexing is not compatible with chunked prefill."
|
||||||
|
assert (
|
||||||
|
self.disaggregation_mode == "null"
|
||||||
|
), "PD-Multiplexing is not compatible with disaggregation mode."
|
||||||
|
assert (
|
||||||
|
self.disable_overlap_schedule
|
||||||
|
), "PD-Multiplexing is not compatible with overlap schedule."
|
||||||
|
|
||||||
|
# NOTE: CUDA Green Context may hit issues with CudaGraph on torch 2.7.x - 2.8.x, leading to performance degradation, so warn on any torch version newer than 2.6.x.
|
||||||
|
import torch
|
||||||
|
|
||||||
|
parts = torch.__version__.split("+", 1)[0].split(".")
|
||||||
|
major = int(parts[0]) if len(parts) > 0 and parts[0].isdigit() else 0
|
||||||
|
minor = int(parts[1]) if len(parts) > 1 and parts[1].isdigit() else 0
|
||||||
|
if (major, minor) > (2, 6):
|
||||||
|
logger.warning(
|
||||||
|
"WARNING: PD-Multiplexing may experience performance degradation with torch versions > 2.6.x.\n"
|
||||||
|
f" Current torch version is {torch.__version__}.\n"
|
||||||
|
" Please manually install torch 2.6.x."
|
||||||
|
)
|
||||||
|
|
||||||
# Check multi tokenizer
|
# Check multi tokenizer
|
||||||
assert self.tokenizer_worker_num > 0, "Tokenizer worker num must >= 1"
|
assert self.tokenizer_worker_num > 0, "Tokenizer worker num must >= 1"
|
||||||
self.validate_buckets_rule(
|
self.validate_buckets_rule(
|
||||||
|
|||||||
Reference in New Issue
Block a user