diff --git a/docs/advanced_features/server_arguments.md b/docs/advanced_features/server_arguments.md index e76533df7..1f1f801ac 100644 --- a/docs/advanced_features/server_arguments.md +++ b/docs/advanced_features/server_arguments.md @@ -136,7 +136,7 @@ Please consult the documentation below and [server_args.py](https://github.com/s | `--device` | The device to use ('cuda', 'xpu', 'hpu', 'npu', 'cpu'). Defaults to auto-detection if not specified. | None | | `--tp-size` | The tensor parallelism size. | 1 | | `--pp-size` | The pipeline parallelism size. | 1 | -| `--max-micro-batch-size` | The maximum micro batch size in pipeline parallelism. | None | +| `--pp-max-micro-batch-size` | The maximum micro batch size in pipeline parallelism. | None | | `--stream-interval` | The interval (or buffer size) for streaming in terms of the token length. A smaller value makes streaming smoother, while a larger value makes the throughput higher. | 1 | | `--stream-output` | Whether to output as a sequence of disjoint segments. | False | | `--random-seed` | The random seed. | None | diff --git a/python/sglang/srt/entrypoints/http_server.py b/python/sglang/srt/entrypoints/http_server.py index 60f68905e..9ba8e6374 100644 --- a/python/sglang/srt/entrypoints/http_server.py +++ b/python/sglang/srt/entrypoints/http_server.py @@ -494,7 +494,7 @@ async def get_load(): # example usage: -# curl -s -X POST http://localhost:30000/set_internal_state -H "Content-Type: application/json" -d '{"server_args": {"max_micro_batch_size": 8}}' +# curl -s -X POST http://localhost:30000/set_internal_state -H "Content-Type: application/json" -d '{"server_args": {"pp_max_micro_batch_size": 8}}' @app.api_route("/set_internal_state", methods=["POST", "PUT"]) async def set_internal_state(obj: SetInternalStateReq, request: Request): res = await _global_state.tokenizer_manager.set_internal_state(obj) diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py index fba066433..af14d95d8 100644 --- a/python/sglang/srt/managers/schedule_batch.py +++ b/python/sglang/srt/managers/schedule_batch.py @@ -97,7 +97,7 @@ GLOBAL_SERVER_ARGS_KEYS = [ "ep_num_redundant_experts", "enable_nan_detection", "flashinfer_mla_disable_ragged", - "max_micro_batch_size", + "pp_max_micro_batch_size", "disable_shared_experts_fusion", "sampling_backend", "speculative_accept_threshold_single", diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index e1d558c1b..e5d6fab7b 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -464,8 +464,8 @@ class Scheduler( _, _, ) = self.tp_worker.get_worker_info() - if global_server_args_dict["max_micro_batch_size"] is None: - global_server_args_dict["max_micro_batch_size"] = max( + if global_server_args_dict["pp_max_micro_batch_size"] is None: + global_server_args_dict["pp_max_micro_batch_size"] = max( self.max_running_requests // server_args.pp_size, 1 ) @@ -1802,7 +1802,7 @@ class Scheduler( return ret def get_num_allocatable_reqs(self, running_bs): - res = global_server_args_dict["max_micro_batch_size"] - running_bs + res = global_server_args_dict["pp_max_micro_batch_size"] - running_bs if self.pp_size > 1: res = min(res, self.req_to_token_pool.available_size()) return res @@ -2510,7 +2510,7 @@ class Scheduler( server_args_dict = recv_req.server_args args_allow_update = set( [ - "max_micro_batch_size", + "pp_max_micro_batch_size", "speculative_accept_threshold_single", "speculative_accept_threshold_acc", ] @@ -2521,7 +2521,7 @@ class Scheduler( logging.warning(f"Updating {k} is not supported.") if_success = False break - elif k == "max_micro_batch_size" and ( + elif k == "pp_max_micro_batch_size" and ( v > self.max_running_requests // self.pp_size or v < 1 ): logging.warning( diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index 90b7ad536..63a98b520 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -205,7 +205,7 @@ class ServerArgs: device: Optional[str] = None tp_size: int = 1 pp_size: int = 1 - max_micro_batch_size: Optional[int] = None + pp_max_micro_batch_size: Optional[int] = None stream_interval: int = 1 stream_output: bool = False random_seed: Optional[int] = None @@ -1599,9 +1599,9 @@ class ServerArgs: help="The pipeline parallelism size.", ) parser.add_argument( - "--max-micro-batch-size", + "--pp-max-micro-batch-size", type=int, - default=ServerArgs.max_micro_batch_size, + default=ServerArgs.pp_max_micro_batch_size, help="The maximum micro batch size in pipeline parallelism.", ) parser.add_argument(