feat: throttle requests at scheduler based on --max_queued_requests (#7565)
This commit is contained in:
@@ -24,6 +24,7 @@ import time
|
||||
from collections import defaultdict, deque
|
||||
from concurrent import futures
|
||||
from dataclasses import dataclass
|
||||
from http import HTTPStatus
|
||||
from pathlib import Path
|
||||
from types import SimpleNamespace
|
||||
from typing import Dict, List, Optional, Tuple, Union
|
||||
@@ -370,6 +371,7 @@ class Scheduler(
|
||||
self.max_total_num_tokens,
|
||||
self.max_prefill_tokens,
|
||||
self.max_running_requests,
|
||||
self.max_queued_requests,
|
||||
self.max_req_len,
|
||||
self.max_req_input_len,
|
||||
self.random_seed,
|
||||
@@ -1086,6 +1088,19 @@ class Scheduler(
|
||||
self.return_health_check_ct += 1
|
||||
continue
|
||||
|
||||
# If it is a work request, accept or reject the request based on the request queue size.
|
||||
if is_work_request(recv_req):
|
||||
if len(self.waiting_queue) + 1 > self.max_queued_requests:
|
||||
abort_req = AbortReq(
|
||||
recv_req.rid,
|
||||
finished_reason={
|
||||
"type": "abort",
|
||||
"status_code": HTTPStatus.SERVICE_UNAVAILABLE,
|
||||
"message": "The request queue is full.",
|
||||
},
|
||||
)
|
||||
self.send_to_tokenizer.send_pyobj(abort_req)
|
||||
continue
|
||||
output = self._request_dispatcher(recv_req)
|
||||
if output is not None:
|
||||
if isinstance(output, RpcReqOutput):
|
||||
@@ -2902,6 +2917,10 @@ def is_health_check_generate_req(recv_req):
|
||||
return getattr(recv_req, "rid", "").startswith("HEALTH_CHECK")
|
||||
|
||||
|
||||
def is_work_request(recv_req):
    """Return True if *recv_req* is an actual inference work request.

    Only tokenized generate/embedding requests count as "work": these are
    the requests placed on the waiting queue and therefore subject to the
    --max_queued_requests throttle (see the queue-size check in the
    scheduler's recv loop). Control/RPC/health-check messages return False
    and bypass the throttle.
    """
    work_request_types = (TokenizedGenerateReqInput, TokenizedEmbeddingReqInput)
    return isinstance(recv_req, work_request_types)
|
||||
|
||||
|
||||
def _export_static_state(model):
|
||||
return dict(
|
||||
buffers=[
|
||||
|
||||
Reference in New Issue
Block a user