From efbc116a0f81e7c3f09f45b0720152aa5b91dc0d Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Sun, 3 Nov 2024 01:45:20 -0700 Subject: [PATCH] Do not use longest prefix matching when #queue-req is large (#1896) --- python/sglang/srt/managers/schedule_policy.py | 20 ++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/python/sglang/srt/managers/schedule_policy.py b/python/sglang/srt/managers/schedule_policy.py index 6ea6ff194..2bfdffc42 100644 --- a/python/sglang/srt/managers/schedule_policy.py +++ b/python/sglang/srt/managers/schedule_policy.py @@ -45,9 +45,15 @@ class SchedulePolicy: self.tree_cache = tree_cache def calc_priority(self, waiting_queue: List[Req]): + if len(waiting_queue) > 128 and self.policy == "lpm": + # Turn off the expensive prefix matching and sorting when the #queue is large. + policy = "fcfs" + else: + policy = self.policy + # Compute matched prefix length prefix_computed = False - if self.policy == "lpm" or self.policy == "dfs-weight": + if policy == "lpm" or policy == "dfs-weight": for r in waiting_queue: # NOTE: the prefix_indices must always be aligned with last_node r.prefix_indices, r.last_node = self.tree_cache.match_prefix( @@ -56,18 +62,18 @@ class SchedulePolicy: prefix_computed = True - if self.policy == "lpm": + if policy == "lpm": # Longest Prefix Match waiting_queue.sort(key=lambda x: -len(x.prefix_indices)) - elif self.policy == "fcfs": + elif policy == "fcfs": # first come first serve pass - elif self.policy == "lof": + elif policy == "lof": # longest output first waiting_queue.sort(key=lambda x: -x.sampling_params.max_new_tokens) - elif self.policy == "random": + elif policy == "random": random.shuffle(waiting_queue) - elif self.policy == "dfs-weight": + elif policy == "dfs-weight": last_node_to_reqs = defaultdict(list) for req in waiting_queue: last_node_to_reqs[req.last_node].append(req) @@ -85,7 +91,7 @@ class SchedulePolicy: waiting_queue, ) else: - raise ValueError(f"Unknown schedule_policy: {self.policy}") + raise ValueError(f"Unknown schedule_policy: {policy=}") return prefix_computed