Fix memory leak when aborting decode request in PD-Disagg (#9817)
Co-authored-by: Lianmin Zheng <15100009+merrymercy@users.noreply.github.com>
This commit is contained in:
@@ -6,7 +6,6 @@ from sglang.srt.disaggregation.mini_lb import PrefillConfig, run
|
||||
|
||||
@dataclasses.dataclass
|
||||
class LBArgs:
|
||||
rust_lb: bool = False
|
||||
host: str = "0.0.0.0"
|
||||
port: int = 8000
|
||||
policy: str = "random"
|
||||
@@ -17,11 +16,6 @@ class LBArgs:
|
||||
|
||||
@staticmethod
|
||||
def add_cli_args(parser: argparse.ArgumentParser):
|
||||
parser.add_argument(
|
||||
"--rust-lb",
|
||||
action="store_true",
|
||||
help="Deprecated, please use SGLang Router instead, this argument will have no effect.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--host",
|
||||
type=str,
|
||||
@@ -92,7 +86,6 @@ class LBArgs:
|
||||
]
|
||||
|
||||
return cls(
|
||||
rust_lb=args.rust_lb,
|
||||
host=args.host,
|
||||
port=args.port,
|
||||
policy=args.policy,
|
||||
@@ -102,12 +95,6 @@ class LBArgs:
|
||||
timeout=args.timeout,
|
||||
)
|
||||
|
||||
def __post_init__(self):
|
||||
if not self.rust_lb:
|
||||
assert (
|
||||
self.policy == "random"
|
||||
), "Only random policy is supported for Python load balancer"
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
|
||||
@@ -2378,6 +2378,10 @@ class Scheduler(
|
||||
# We still need to send something back to TokenizerManager to clean up the state.
|
||||
req = self.waiting_queue.pop(i)
|
||||
self.send_to_tokenizer.send_pyobj(AbortReq(req.rid))
|
||||
# For disaggregation decode mode, the request in the waiting queue has KV cache allocated.
|
||||
if self.disaggregation_mode == DisaggregationMode.DECODE:
|
||||
self.tree_cache.cache_finished_req(req)
|
||||
|
||||
logger.debug(f"Abort queued request. {req.rid=}")
|
||||
|
||||
# Delete the requests in the grammar queue
|
||||
|
||||
Reference in New Issue
Block a user