Expose more arguments to control the scheduling policy (#32)
This commit is contained in:
@@ -4,9 +4,9 @@ from typing import Callable, List, Optional, Union
 from sglang.backend.anthropic import Anthropic
 from sglang.backend.base_backend import BaseBackend
-from sglang.backend.vertexai import VertexAI
 from sglang.backend.openai import OpenAI
 from sglang.backend.runtime_endpoint import RuntimeEndpoint
+from sglang.backend.vertexai import VertexAI
 from sglang.global_config import global_config
 from sglang.lang.ir import (
     SglExpr,
@@ -54,7 +54,9 @@ class SglSamplingParams:
     def to_vertexai_kwargs(self):
         if self.regex is not None:
-            warnings.warn("Regular expression is not supported in the VertexAI backend.")
+            warnings.warn(
+                "Regular expression is not supported in the VertexAI backend."
+            )
         return {
             "candidate_count": 1,
             "max_output_tokens": self.max_new_tokens,
@@ -67,7 +69,9 @@ class SglSamplingParams:
     def to_anthropic_kwargs(self):
         # Anthropic does not support frequency_penalty or presence_penalty, so we drop it here
         if self.regex is not None:
-            warnings.warn("Regular expression is not supported in the Anthropic backend.")
+            warnings.warn(
+                "Regular expression is not supported in the Anthropic backend."
+            )
         return {
             "max_tokens_to_sample": self.max_new_tokens,
             "stop_sequences": self.stop,
@@ -45,6 +45,7 @@ class ModelRpcServer(rpyc.Service):
         self.tp_rank = tp_rank
         self.tp_size = server_args.tp_size
         self.schedule_heuristic = server_args.schedule_heuristic
+        self.schedule_conservativeness = server_args.schedule_conservativeness

         # Init model and tokenizer
         self.model_config = ModelConfig(
@@ -248,7 +249,9 @@ class ModelRpcServer(rpyc.Service):
         available_size = (
             self.token_to_kv_pool.available_size() + self.tree_cache.evictable_size()
         )
-        new_ratio = self.scheduler.new_token_estimation_ratio()
+        new_ratio = (
+            self.scheduler.new_token_estimation_ratio() * self.schedule_conservativeness
+        )
         if self.running_batch:
             available_size -= sum(
                 [
@@ -16,6 +16,7 @@ class ServerArgs:
     tp_size: int = 1
     model_mode: List[str] = ()
     schedule_heuristic: str = "lpm"
+    schedule_conservativeness: float = 1.0
     random_seed: int = 42
     stream_interval: int = 2
     disable_log_stats: bool = False
@@ -85,7 +86,7 @@ class ServerArgs:
             "--mem-fraction-static",
             type=float,
             default=ServerArgs.mem_fraction_static,
-            help="The fraction of the memory used for static allocation (model weights and KV cache memory pool)",
+            help="The fraction of the memory used for static allocation (model weights and KV cache memory pool). Use a smaller value if you see out-of-memory errors.",
         )
         parser.add_argument(
             "--tp-size",
@@ -107,6 +108,12 @@ class ServerArgs:
|
||||
default=ServerArgs.schedule_heuristic,
|
||||
help="Schudule mode: [lpm, weight, random, fcfs]",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--schedule-conservativeness",
|
||||
type=float,
|
||||
default=ServerArgs.schedule_conservativeness,
|
||||
help="How conservative the schedule policy is. A larger value means more conservative scheduling. Use a larger value if you see out-of-memory errors.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--random-seed",
|
||||
type=int,
|
||||
|
||||
Reference in New Issue
Block a user