Fix retract for page size > 1 (#4914)
This commit is contained in:
@@ -169,7 +169,9 @@ class BaseGrammarBackend(ABC):
|
||||
self.cache.clear()
|
||||
|
||||
|
||||
def create_grammar_backend(server_args: ServerArgs, tokenizer, vocab_size):
|
||||
def create_grammar_backend(
|
||||
server_args: ServerArgs, tokenizer, vocab_size: int
|
||||
) -> Optional[BaseGrammarBackend]:
|
||||
if server_args.grammar_backend == "outlines":
|
||||
from sglang.srt.constrained.outlines_backend import OutlinesGrammarBackend
|
||||
|
||||
@@ -188,6 +190,8 @@ def create_grammar_backend(server_args: ServerArgs, tokenizer, vocab_size):
|
||||
tokenizer=tokenizer,
|
||||
whitespace_pattern=server_args.constrained_json_whitespace_pattern,
|
||||
)
|
||||
elif server_args.grammar_backend == "none":
|
||||
return None
|
||||
else:
|
||||
raise ValueError(f"Invalid grammar backend: {server_args.grammar_backend}")
|
||||
|
||||
|
||||
@@ -599,6 +599,7 @@ class Req:
|
||||
self.extend_logprob_start_len = 0
|
||||
self.is_chunked = 0
|
||||
self.req_pool_idx = None
|
||||
self.already_computed = 0
|
||||
|
||||
def __repr__(self):
|
||||
return (
|
||||
@@ -960,8 +961,6 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
|
||||
# If req.input_embeds is already a list, append its content directly
|
||||
input_embeds.extend(req.input_embeds) # Use extend to avoid nesting
|
||||
|
||||
if req.is_retracted:
|
||||
req.already_computed = 0
|
||||
req.cached_tokens += pre_len - req.already_computed
|
||||
req.already_computed = seq_len
|
||||
req.is_retracted = False
|
||||
@@ -1189,7 +1188,11 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
|
||||
self.req_to_token_pool.free(req.req_pool_idx)
|
||||
else:
|
||||
# TODO: apply more fine-grained retraction
|
||||
last_uncached_pos = len(req.prefix_indices)
|
||||
last_uncached_pos = (
|
||||
(len(req.prefix_indices) + server_args.page_size - 1)
|
||||
// server_args.page_size
|
||||
* server_args.page_size
|
||||
)
|
||||
token_indices = self.req_to_token_pool.req_to_token[
|
||||
req.req_pool_idx, last_uncached_pos : seq_lens_cpu[idx]
|
||||
]
|
||||
|
||||
@@ -33,7 +33,7 @@ class SchedulerMetricsCollector:
|
||||
|
||||
def __init__(self, labels: Dict[str, str]) -> None:
|
||||
# We need to import prometheus_client after setting the env variable `PROMETHEUS_MULTIPROC_DIR`
|
||||
from prometheus_client import Gauge
|
||||
from prometheus_client import Gauge, Histogram
|
||||
|
||||
self.labels = labels
|
||||
self.last_log_time = time.time()
|
||||
@@ -139,10 +139,10 @@ class TokenizerMetricsCollector:
|
||||
labelnames=labels.keys(),
|
||||
buckets=[
|
||||
0.1,
|
||||
0.3,
|
||||
0.5,
|
||||
0.7,
|
||||
0.9,
|
||||
0.2,
|
||||
0.4,
|
||||
0.6,
|
||||
0.8,
|
||||
1,
|
||||
2,
|
||||
4,
|
||||
@@ -153,36 +153,9 @@ class TokenizerMetricsCollector:
|
||||
40,
|
||||
60,
|
||||
80,
|
||||
120,
|
||||
160,
|
||||
],
|
||||
)
|
||||
|
||||
self.histogram_time_per_output_token = Histogram(
|
||||
name="sglang:time_per_output_token_seconds",
|
||||
documentation="Histogram of time per output token in seconds.",
|
||||
labelnames=labels.keys(),
|
||||
buckets=[
|
||||
0.002,
|
||||
0.005,
|
||||
0.010,
|
||||
0.020,
|
||||
0.030,
|
||||
0.040,
|
||||
0.050,
|
||||
0.060,
|
||||
0.070,
|
||||
0.080,
|
||||
0.090,
|
||||
0.100,
|
||||
0.150,
|
||||
0.200,
|
||||
0.300,
|
||||
0.400,
|
||||
0.600,
|
||||
0.800,
|
||||
1.000,
|
||||
2.000,
|
||||
100,
|
||||
200,
|
||||
400,
|
||||
],
|
||||
)
|
||||
|
||||
@@ -202,17 +175,18 @@ class TokenizerMetricsCollector:
|
||||
0.030,
|
||||
0.035,
|
||||
0.040,
|
||||
0.050,
|
||||
0.075,
|
||||
0.060,
|
||||
0.080,
|
||||
0.100,
|
||||
0.150,
|
||||
0.200,
|
||||
0.300,
|
||||
0.400,
|
||||
0.500,
|
||||
0.750,
|
||||
0.600,
|
||||
0.800,
|
||||
1.000,
|
||||
2.000,
|
||||
4.000,
|
||||
6.000,
|
||||
8.000,
|
||||
],
|
||||
)
|
||||
|
||||
@@ -224,23 +198,22 @@ class TokenizerMetricsCollector:
|
||||
0.1,
|
||||
0.2,
|
||||
0.4,
|
||||
0.6,
|
||||
0.8,
|
||||
1,
|
||||
2,
|
||||
5,
|
||||
4,
|
||||
6,
|
||||
8,
|
||||
10,
|
||||
20,
|
||||
40,
|
||||
60,
|
||||
80,
|
||||
100,
|
||||
150,
|
||||
200,
|
||||
250,
|
||||
300,
|
||||
350,
|
||||
500,
|
||||
1000,
|
||||
400,
|
||||
800,
|
||||
],
|
||||
)
|
||||
|
||||
@@ -256,13 +229,10 @@ class TokenizerMetricsCollector:
|
||||
):
|
||||
self.prompt_tokens_total.labels(**self.labels).inc(prompt_tokens)
|
||||
self.generation_tokens_total.labels(**self.labels).inc(generation_tokens)
|
||||
self.cached_tokens_total.labels(**self.labels).inc(cached_tokens)
|
||||
if cached_tokens > 0:
|
||||
self.cached_tokens_total.labels(**self.labels).inc(cached_tokens)
|
||||
self.num_requests_total.labels(**self.labels).inc(1)
|
||||
self._log_histogram(self.histogram_e2e_request_latency, e2e_latency)
|
||||
if generation_tokens >= 1:
|
||||
self.histogram_time_per_output_token.labels(**self.labels).observe(
|
||||
e2e_latency / generation_tokens
|
||||
)
|
||||
|
||||
def observe_time_to_first_token(self, value: float):
|
||||
self.histogram_time_to_first_token.labels(**self.labels).observe(value)
|
||||
|
||||
@@ -128,7 +128,7 @@ class ServerArgs:
|
||||
# Kernel backend
|
||||
attention_backend: Optional[str] = None
|
||||
sampling_backend: Optional[str] = None
|
||||
grammar_backend: Optional[str] = "xgrammar"
|
||||
grammar_backend: Optional[str] = None
|
||||
|
||||
# Speculative decoding
|
||||
speculative_algorithm: Optional[str] = None
|
||||
@@ -193,6 +193,13 @@ class ServerArgs:
|
||||
disaggregation_bootstrap_port: int = 8998
|
||||
|
||||
def __post_init__(self):
|
||||
# Expert parallelism
|
||||
if self.enable_ep_moe:
|
||||
self.ep_size = self.tp_size
|
||||
logger.info(
|
||||
f"EP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
|
||||
)
|
||||
|
||||
# Set missing default values
|
||||
if self.tokenizer_path is None:
|
||||
self.tokenizer_path = self.model_path
|
||||
@@ -274,12 +281,9 @@ class ServerArgs:
|
||||
)
|
||||
self.disable_cuda_graph = True
|
||||
|
||||
# Expert parallelism
|
||||
if self.enable_ep_moe:
|
||||
self.ep_size = self.tp_size
|
||||
logger.info(
|
||||
f"EP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
|
||||
)
|
||||
# Choose grammar backend
|
||||
if self.grammar_backend is None:
|
||||
self.grammar_backend = "xgrammar"
|
||||
|
||||
# Data parallelism attention
|
||||
if self.enable_dp_attention:
|
||||
@@ -813,7 +817,7 @@ class ServerArgs:
|
||||
parser.add_argument(
|
||||
"--grammar-backend",
|
||||
type=str,
|
||||
choices=["xgrammar", "outlines", "llguidance"],
|
||||
choices=["xgrammar", "outlines", "llguidance", "none"],
|
||||
default=ServerArgs.grammar_backend,
|
||||
help="Choose the backend for grammar-guided decoding.",
|
||||
)
|
||||
|
||||
@@ -1012,9 +1012,6 @@ def run_logprob_check(self: unittest.TestCase, arg: Tuple):
|
||||
|
||||
|
||||
class CustomTestCase(unittest.TestCase):
|
||||
pass
|
||||
|
||||
"""
|
||||
def _callTestMethod(self, method):
|
||||
max_retry = int(
|
||||
os.environ.get("SGLANG_TEST_MAX_RETRY", "2" if is_in_ci() else "0")
|
||||
@@ -1023,4 +1020,3 @@ class CustomTestCase(unittest.TestCase):
|
||||
lambda: super(CustomTestCase, self)._callTestMethod(method),
|
||||
max_retry=max_retry,
|
||||
)
|
||||
"""
|
||||
|
||||
Reference in New Issue
Block a user