Fix memory leak when aborting a decode request in PD-Disagg (prefill/decode disaggregation) (#9817)
Co-authored-by: Lianmin Zheng <15100009+merrymercy@users.noreply.github.com>
This commit is contained in:
@@ -6,7 +6,6 @@ from sglang.srt.disaggregation.mini_lb import PrefillConfig, run
|
|||||||
|
|
||||||
@dataclasses.dataclass
|
@dataclasses.dataclass
|
||||||
class LBArgs:
|
class LBArgs:
|
||||||
rust_lb: bool = False
|
|
||||||
host: str = "0.0.0.0"
|
host: str = "0.0.0.0"
|
||||||
port: int = 8000
|
port: int = 8000
|
||||||
policy: str = "random"
|
policy: str = "random"
|
||||||
@@ -17,11 +16,6 @@ class LBArgs:
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def add_cli_args(parser: argparse.ArgumentParser):
|
def add_cli_args(parser: argparse.ArgumentParser):
|
||||||
parser.add_argument(
|
|
||||||
"--rust-lb",
|
|
||||||
action="store_true",
|
|
||||||
help="Deprecated, please use SGLang Router instead, this argument will have no effect.",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--host",
|
"--host",
|
||||||
type=str,
|
type=str,
|
||||||
@@ -92,7 +86,6 @@ class LBArgs:
|
|||||||
]
|
]
|
||||||
|
|
||||||
return cls(
|
return cls(
|
||||||
rust_lb=args.rust_lb,
|
|
||||||
host=args.host,
|
host=args.host,
|
||||||
port=args.port,
|
port=args.port,
|
||||||
policy=args.policy,
|
policy=args.policy,
|
||||||
@@ -102,12 +95,6 @@ class LBArgs:
|
|||||||
timeout=args.timeout,
|
timeout=args.timeout,
|
||||||
)
|
)
|
||||||
|
|
||||||
def __post_init__(self):
|
|
||||||
if not self.rust_lb:
|
|
||||||
assert (
|
|
||||||
self.policy == "random"
|
|
||||||
), "Only random policy is supported for Python load balancer"
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(
|
||||||
|
|||||||
@@ -2378,6 +2378,10 @@ class Scheduler(
|
|||||||
# We still need to send something back to TokenizerManager to clean up the state.
|
# We still need to send something back to TokenizerManager to clean up the state.
|
||||||
req = self.waiting_queue.pop(i)
|
req = self.waiting_queue.pop(i)
|
||||||
self.send_to_tokenizer.send_pyobj(AbortReq(req.rid))
|
self.send_to_tokenizer.send_pyobj(AbortReq(req.rid))
|
||||||
|
# In disaggregation decode mode, a request in the waiting queue already has KV cache allocated, so it must be released when the request is aborted.
|
||||||
|
if self.disaggregation_mode == DisaggregationMode.DECODE:
|
||||||
|
self.tree_cache.cache_finished_req(req)
|
||||||
|
|
||||||
logger.debug(f"Abort queued request. {req.rid=}")
|
logger.debug(f"Abort queued request. {req.rid=}")
|
||||||
|
|
||||||
# Delete the requests in the grammar queue
|
# Delete the requests in the grammar queue
|
||||||
|
|||||||
Reference in New Issue
Block a user