[Documentation][Configuration] Server args and documentation of PD-Multiplexing. (#11427)
@@ -471,7 +471,8 @@ class ServerArgs:
 
     # For PD-Multiplexing
    enable_pdmux: bool = False
-    sm_group_num: int = 3
+    pdmux_config_path: Optional[str] = None
+    sm_group_num: int = 8
 
 def get_attention_backends(server_args):
     prefill_attention_backend_str = (
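For orientation, a minimal standalone sketch of the three PD-Multiplexing fields after this change (the PDMuxArgs dataclass here is a hypothetical stand-in, not the real ServerArgs): pdmux stays off by default, the config path is optional, and the default SM group count moves from 3 to 8.

from dataclasses import dataclass
from typing import Optional

@dataclass
class PDMuxArgs:
    # Hypothetical stand-in mirroring the ServerArgs fields above.
    enable_pdmux: bool = False
    pdmux_config_path: Optional[str] = None
    sm_group_num: int = 8

print(PDMuxArgs())
# PDMuxArgs(enable_pdmux=False, pdmux_config_path=None, sm_group_num=8)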
@@ -2893,6 +2894,12 @@ class ServerArgs:
             action="store_true",
             help="Enable PD-Multiplexing, PD running on greenctx stream.",
         )
+        parser.add_argument(
+            "--pdmux-config-path",
+            type=str,
+            default=None,
+            help="The path of the PD-Multiplexing config file.",
+        )
 
         parser.add_argument(
             "--sm-group-num",
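A runnable sketch of how the new flag parses next to the existing ones, using a standalone argparse parser rather than the real sglang one; the filename "pdmux.yaml" and the type/default of --sm-group-num are assumptions based on the dataclass fields above.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--enable-pdmux", action="store_true")
parser.add_argument("--pdmux-config-path", type=str, default=None)
parser.add_argument("--sm-group-num", type=int, default=8)  # assumed type/default

# "pdmux.yaml" is an illustrative filename, not a file shipped with sglang.
args = parser.parse_args(["--enable-pdmux", "--pdmux-config-path", "pdmux.yaml"])
assert args.enable_pdmux is True
assert args.pdmux_config_path == "pdmux.yaml"
assert args.sm_group_num == 8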
@@ -3021,6 +3028,34 @@ class ServerArgs:
             self.chunked_prefill_size % self.page_size == 0
         ), "chunked_prefill_size must be divisible by page_size"
 
+        # Check pdmux
+        if self.enable_pdmux:
+            assert (
+                self.pp_size == 1
+            ), "PD-Multiplexing is only supported with pipeline parallelism disabled (pp_size=1)."
+            assert (
+                self.chunked_prefill_size == -1
+            ), "PD-Multiplexing is not compatible with chunked prefill."
+            assert (
+                self.disaggregation_mode == "null"
+            ), "PD-Multiplexing is not compatible with disaggregation mode."
+            assert (
+                self.disable_overlap_schedule
+            ), "PD-Multiplexing is not compatible with overlap schedule."
+
+            # NOTE: CUDA Green Context may encounter issues with CudaGraph on torch 2.7.x – 2.8.x, leading to performance degradation.
+            import torch
+
+            parts = torch.__version__.split("+", 1)[0].split(".")
+            major = int(parts[0]) if len(parts) > 0 and parts[0].isdigit() else 0
+            minor = int(parts[1]) if len(parts) > 1 and parts[1].isdigit() else 0
+            if (major, minor) > (2, 6):
+                logger.warning(
+                    "WARNING: PD-Multiplexing may experience performance degradation with torch versions > 2.6.x.\n"
+                    f"    Current torch version is {torch.__version__}.\n"
+                    "    Please manually install torch 2.6.x."
+                )
+
         # Check multi tokenizer
         assert self.tokenizer_worker_num > 0, "Tokenizer worker num must >= 1"
         self.validate_buckets_rule(
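The torch version gate above can be exercised in isolation. A short sketch, with a hypothetical helper torch_version_tuple, showing how the (major, minor) parse treats typical version strings:

def torch_version_tuple(version: str) -> tuple:
    # Same parse as in the diff: strip any local build suffix ("+cu121"),
    # then read major/minor, falling back to 0 for non-numeric parts.
    parts = version.split("+", 1)[0].split(".")
    major = int(parts[0]) if len(parts) > 0 and parts[0].isdigit() else 0
    minor = int(parts[1]) if len(parts) > 1 and parts[1].isdigit() else 0
    return (major, minor)

assert torch_version_tuple("2.6.0") == (2, 6)       # allowed, no warning
assert torch_version_tuple("2.8.0+cu121") > (2, 6)  # triggers the warning
assert torch_version_tuple("dev") == (0, 0)         # unparsable -> never warns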