diff --git a/python/sglang/api.py b/python/sglang/api.py
index ed31ec141..208077941 100644
--- a/python/sglang/api.py
+++ b/python/sglang/api.py
@@ -4,9 +4,9 @@ from typing import Callable, List, Optional, Union
 
 from sglang.backend.anthropic import Anthropic
 from sglang.backend.base_backend import BaseBackend
-from sglang.backend.vertexai import VertexAI
 from sglang.backend.openai import OpenAI
 from sglang.backend.runtime_endpoint import RuntimeEndpoint
+from sglang.backend.vertexai import VertexAI
 from sglang.global_config import global_config
 from sglang.lang.ir import (
     SglExpr,
diff --git a/python/sglang/lang/ir.py b/python/sglang/lang/ir.py
index b6c1b9b54..6202bffaf 100644
--- a/python/sglang/lang/ir.py
+++ b/python/sglang/lang/ir.py
@@ -54,7 +54,9 @@ class SglSamplingParams:
 
     def to_vertexai_kwargs(self):
         if self.regex is not None:
-            warnings.warn("Regular expression is not supported in the VertexAI backend.")
+            warnings.warn(
+                "Regular expression is not supported in the VertexAI backend."
+            )
         return {
             "candidate_count": 1,
             "max_output_tokens": self.max_new_tokens,
@@ -67,7 +69,9 @@ class SglSamplingParams:
     def to_anthropic_kwargs(self):
         # Anthropic does not support frequency_penalty or presence_penalty, so we drop them here
         if self.regex is not None:
-            warnings.warn("Regular expression is not supported in the Anthropic backend.")
+            warnings.warn(
+                "Regular expression is not supported in the Anthropic backend."
+            )
         return {
             "max_tokens_to_sample": self.max_new_tokens,
             "stop_sequences": self.stop,
diff --git a/python/sglang/srt/managers/router/model_rpc.py b/python/sglang/srt/managers/router/model_rpc.py
index 877afd749..94613ce37 100644
--- a/python/sglang/srt/managers/router/model_rpc.py
+++ b/python/sglang/srt/managers/router/model_rpc.py
@@ -45,6 +45,7 @@ class ModelRpcServer(rpyc.Service):
         self.tp_rank = tp_rank
         self.tp_size = server_args.tp_size
         self.schedule_heuristic = server_args.schedule_heuristic
+        self.schedule_conservativeness = server_args.schedule_conservativeness
 
         # Init model and tokenizer
         self.model_config = ModelConfig(
@@ -248,7 +249,9 @@ class ModelRpcServer(rpyc.Service):
         available_size = (
             self.token_to_kv_pool.available_size() + self.tree_cache.evictable_size()
         )
-        new_ratio = self.scheduler.new_token_estimation_ratio()
+        new_ratio = (
+            self.scheduler.new_token_estimation_ratio() * self.schedule_conservativeness
+        )
         if self.running_batch:
             available_size -= sum(
                 [
diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index 3bf4cb991..e0d1c236d 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -16,6 +16,7 @@ class ServerArgs:
     tp_size: int = 1
     model_mode: List[str] = ()
     schedule_heuristic: str = "lpm"
+    schedule_conservativeness: float = 1.0
     random_seed: int = 42
     stream_interval: int = 2
     disable_log_stats: bool = False
@@ -85,7 +86,7 @@ class ServerArgs:
             "--mem-fraction-static",
             type=float,
             default=ServerArgs.mem_fraction_static,
-            help="The fraction of the memory used for static allocation (model weights and KV cache memory pool)",
+            help="The fraction of the memory used for static allocation (model weights and KV cache memory pool). Use a smaller value if you see out-of-memory errors.",
         )
         parser.add_argument(
             "--tp-size",
@@ -107,6 +108,12 @@ class ServerArgs:
             default=ServerArgs.schedule_heuristic,
             help="Schedule mode: [lpm, weight, random, fcfs]",
         )
+        parser.add_argument(
+            "--schedule-conservativeness",
+            type=float,
+            default=ServerArgs.schedule_conservativeness,
+            help="How conservative the schedule policy is. A larger value means more conservative scheduling. Use a larger value if you see out-of-memory errors.",
+        )
         parser.add_argument(
             "--random-seed",
             type=int,
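
For reviewers: a minimal sketch of what the new `--schedule-conservativeness` knob does. The `can_admit_request` helper below is a hypothetical illustration, not the actual router code; the point is that the factor inflates the scheduler's estimated new-token demand, so values above 1.0 admit fewer requests into the running batch and leave more KV-cache headroom.

```python
def can_admit_request(
    available_kv_tokens: int,
    estimated_new_tokens: int,
    schedule_conservativeness: float = 1.0,
) -> bool:
    """Hypothetical admission check mirroring the model_rpc.py change.

    A conservativeness > 1.0 inflates the estimated token demand, so the
    scheduler admits fewer requests and is less likely to exhaust the KV
    cache pool; a value < 1.0 packs the batch more aggressively.
    """
    inflated_demand = estimated_new_tokens * schedule_conservativeness
    return inflated_demand <= available_kv_tokens


# A request that barely fits at the default setting is rejected at 1.3x.
print(can_admit_request(1000, 900))       # True:  900 <= 1000
print(can_admit_request(1000, 900, 1.3))  # False: 1170 > 1000
```

In practice this would be exercised at launch time with e.g. `--schedule-conservativeness 1.3`; the default of `1.0` preserves the previous scheduling behavior.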