Rename max_micro_batch_size -> pp_max_micro_batch_size (#11279)

2025-10-06 15:50:56 -07:00
parent e2daeb351c
commit 708f4ff490
5 changed files with 11 additions and 11 deletions
--- a/python/sglang/srt/entrypoints/http_server.py
+++ b/python/sglang/srt/entrypoints/http_server.py
@@ -494,7 +494,7 @@ async def get_load():


 # example usage:
-# curl -s -X POST http://localhost:30000/set_internal_state -H "Content-Type: application/json" -d '{"server_args": {"max_micro_batch_size": 8}}'
+# curl -s -X POST http://localhost:30000/set_internal_state -H "Content-Type: application/json" -d '{"server_args": {"pp_max_micro_batch_size": 8}}'
 @app.api_route("/set_internal_state", methods=["POST", "PUT"])
 async def set_internal_state(obj: SetInternalStateReq, request: Request):
    res = await _global_state.tokenizer_manager.set_internal_state(obj)
--- a/python/sglang/srt/managers/schedule_batch.py
+++ b/python/sglang/srt/managers/schedule_batch.py
@@ -97,7 +97,7 @@ GLOBAL_SERVER_ARGS_KEYS = [
    "ep_num_redundant_experts",
    "enable_nan_detection",
    "flashinfer_mla_disable_ragged",
-    "max_micro_batch_size",
+    "pp_max_micro_batch_size",
    "disable_shared_experts_fusion",
    "sampling_backend",
    "speculative_accept_threshold_single",
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -464,8 +464,8 @@ class Scheduler(
            _,
            _,
        ) = self.tp_worker.get_worker_info()
-        if global_server_args_dict["max_micro_batch_size"] is None:
-            global_server_args_dict["max_micro_batch_size"] = max(
+        if global_server_args_dict["pp_max_micro_batch_size"] is None:
+            global_server_args_dict["pp_max_micro_batch_size"] = max(
                self.max_running_requests // server_args.pp_size, 1
            )

@@ -1802,7 +1802,7 @@ class Scheduler(
        return ret

    def get_num_allocatable_reqs(self, running_bs):
-        res = global_server_args_dict["max_micro_batch_size"] - running_bs
+        res = global_server_args_dict["pp_max_micro_batch_size"] - running_bs
        if self.pp_size > 1:
            res = min(res, self.req_to_token_pool.available_size())
        return res
@@ -2510,7 +2510,7 @@ class Scheduler(
        server_args_dict = recv_req.server_args
        args_allow_update = set(
            [
-                "max_micro_batch_size",
+                "pp_max_micro_batch_size",
                "speculative_accept_threshold_single",
                "speculative_accept_threshold_acc",
            ]
@@ -2521,7 +2521,7 @@ class Scheduler(
                logging.warning(f"Updating {k} is not supported.")
                if_success = False
                break
-            elif k == "max_micro_batch_size" and (
+            elif k == "pp_max_micro_batch_size" and (
                v > self.max_running_requests // self.pp_size or v < 1
            ):
                logging.warning(
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -205,7 +205,7 @@ class ServerArgs:
    device: Optional[str] = None
    tp_size: int = 1
    pp_size: int = 1
-    max_micro_batch_size: Optional[int] = None
+    pp_max_micro_batch_size: Optional[int] = None
    stream_interval: int = 1
    stream_output: bool = False
    random_seed: Optional[int] = None
@@ -1599,9 +1599,9 @@ class ServerArgs:
            help="The pipeline parallelism size.",
        )
        parser.add_argument(
-            "--max-micro-batch-size",
+            "--pp-max-micro-batch-size",
            type=int,
-            default=ServerArgs.max_micro_batch_size,
+            default=ServerArgs.pp_max_micro_batch_size,
            help="The maximum micro batch size in pipeline parallelism.",
        )
        parser.add_argument(