Rename max_micro_batch_size -> pp_max_micro_batch_size (#11279)
This commit is contained in:
@@ -494,7 +494,7 @@ async def get_load():
|
||||
|
||||
|
||||
# example usage:
|
||||
# curl -s -X POST http://localhost:30000/set_internal_state -H "Content-Type: application/json" -d '{"server_args": {"max_micro_batch_size": 8}}'
|
||||
# curl -s -X POST http://localhost:30000/set_internal_state -H "Content-Type: application/json" -d '{"server_args": {"pp_max_micro_batch_size": 8}}'
|
||||
@app.api_route("/set_internal_state", methods=["POST", "PUT"])
|
||||
async def set_internal_state(obj: SetInternalStateReq, request: Request):
|
||||
res = await _global_state.tokenizer_manager.set_internal_state(obj)
|
||||
|
||||
@@ -97,7 +97,7 @@ GLOBAL_SERVER_ARGS_KEYS = [
|
||||
"ep_num_redundant_experts",
|
||||
"enable_nan_detection",
|
||||
"flashinfer_mla_disable_ragged",
|
||||
"max_micro_batch_size",
|
||||
"pp_max_micro_batch_size",
|
||||
"disable_shared_experts_fusion",
|
||||
"sampling_backend",
|
||||
"speculative_accept_threshold_single",
|
||||
|
||||
@@ -464,8 +464,8 @@ class Scheduler(
|
||||
_,
|
||||
_,
|
||||
) = self.tp_worker.get_worker_info()
|
||||
if global_server_args_dict["max_micro_batch_size"] is None:
|
||||
global_server_args_dict["max_micro_batch_size"] = max(
|
||||
if global_server_args_dict["pp_max_micro_batch_size"] is None:
|
||||
global_server_args_dict["pp_max_micro_batch_size"] = max(
|
||||
self.max_running_requests // server_args.pp_size, 1
|
||||
)
|
||||
|
||||
@@ -1802,7 +1802,7 @@ class Scheduler(
|
||||
return ret
|
||||
|
||||
def get_num_allocatable_reqs(self, running_bs):
|
||||
res = global_server_args_dict["max_micro_batch_size"] - running_bs
|
||||
res = global_server_args_dict["pp_max_micro_batch_size"] - running_bs
|
||||
if self.pp_size > 1:
|
||||
res = min(res, self.req_to_token_pool.available_size())
|
||||
return res
|
||||
@@ -2510,7 +2510,7 @@ class Scheduler(
|
||||
server_args_dict = recv_req.server_args
|
||||
args_allow_update = set(
|
||||
[
|
||||
"max_micro_batch_size",
|
||||
"pp_max_micro_batch_size",
|
||||
"speculative_accept_threshold_single",
|
||||
"speculative_accept_threshold_acc",
|
||||
]
|
||||
@@ -2521,7 +2521,7 @@ class Scheduler(
|
||||
logging.warning(f"Updating {k} is not supported.")
|
||||
if_success = False
|
||||
break
|
||||
elif k == "max_micro_batch_size" and (
|
||||
elif k == "pp_max_micro_batch_size" and (
|
||||
v > self.max_running_requests // self.pp_size or v < 1
|
||||
):
|
||||
logging.warning(
|
||||
|
||||
@@ -205,7 +205,7 @@ class ServerArgs:
|
||||
device: Optional[str] = None
|
||||
tp_size: int = 1
|
||||
pp_size: int = 1
|
||||
max_micro_batch_size: Optional[int] = None
|
||||
pp_max_micro_batch_size: Optional[int] = None
|
||||
stream_interval: int = 1
|
||||
stream_output: bool = False
|
||||
random_seed: Optional[int] = None
|
||||
@@ -1599,9 +1599,9 @@ class ServerArgs:
|
||||
help="The pipeline parallelism size.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max-micro-batch-size",
|
||||
"--pp-max-micro-batch-size",
|
||||
type=int,
|
||||
default=ServerArgs.max_micro_batch_size,
|
||||
default=ServerArgs.pp_max_micro_batch_size,
|
||||
help="The maximum micro batch size in pipeline parallelism.",
|
||||
)
|
||||
parser.add_argument(
|
||||
|
||||
Reference in New Issue
Block a user