From f8ca2368b20d2f7eb378dce7f2e0056beb144c4b Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Fri, 25 Jul 2025 15:44:01 -0700 Subject: [PATCH] fix: kimi k2 xgrammar crash (#8367) Co-authored-by: cicirori <32845984+cicirori@users.noreply.github.com> Co-authored-by: gongwei-130 <56567052+gongwei-130@users.noreply.github.com> --- python/sglang/srt/managers/schedule_batch.py | 10 ++++++++++ python/sglang/srt/managers/scheduler.py | 7 +++++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py index ea7cad98b..ad8bcf119 100644 --- a/python/sglang/srt/managers/schedule_batch.py +++ b/python/sglang/srt/managers/schedule_batch.py @@ -431,6 +431,7 @@ class Req: bootstrap_port: Optional[int] = None, bootstrap_room: Optional[int] = None, data_parallel_rank: Optional[int] = None, + vocab_size: Optional[int] = None, ): # Input and output info self.rid = rid @@ -480,6 +481,7 @@ class Req: self.to_abort_message: str = None self.stream = stream self.eos_token_ids = eos_token_ids + self.vocab_size = vocab_size # For incremental decoding # ----- | --------- read_ids -------| @@ -713,6 +715,14 @@ class Req: self.finished_reason = FINISH_MATCHED_TOKEN(matched=last_token_id) return + if self.vocab_size is not None and (last_token_id >= self.vocab_size or last_token_id < 0): + if self.sampling_params.stop_token_ids: + self.output_ids[-1] = next(iter(self.sampling_params.stop_token_ids)) + if self.eos_token_ids: + self.output_ids[-1] = next(iter(self.eos_token_ids)) + self.finished_reason = FINISH_MATCHED_STR(matched="NaN happened") + return + # Check stop strings if len(self.sampling_params.stop_strs) > 0: tail_str = self.tokenizer.decode( diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index 9b7a8b7d9..f3eb20cad 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -1129,6 +1129,7 @@ class Scheduler( 
bootstrap_port=recv_req.bootstrap_port, bootstrap_room=recv_req.bootstrap_room, data_parallel_rank=recv_req.data_parallel_rank, + vocab_size=self.model_config.vocab_size, ) req.tokenizer = self.tokenizer @@ -1395,8 +1396,10 @@ class Scheduler( logger.info(f) if self.enable_metrics: - cache_hit_rate = adder.log_hit_tokens / ( - adder.log_input_tokens + adder.log_hit_tokens + total_tokens = adder.log_input_tokens + adder.log_hit_tokens + + cache_hit_rate = ( + adder.log_hit_tokens / total_tokens if total_tokens > 0 else 0.0 ) self.stats.num_running_reqs = running_bs self.stats.num_used_tokens = num_used