[Bugfix] Prevent PD server crash from invalid grammar (#8062)
Signed-off-by: Shangming Cai <caishangming@linux.alibaba.com>
This commit is contained in:
@@ -1,10 +1,12 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
|
from http import HTTPStatus
|
||||||
from typing import TYPE_CHECKING
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
|
from sglang.srt.disaggregation.utils import prepare_abort
|
||||||
from sglang.srt.model_executor.forward_batch_info import CaptureHiddenMode, ForwardMode
|
from sglang.srt.model_executor.forward_batch_info import CaptureHiddenMode, ForwardMode
|
||||||
from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
|
from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
|
||||||
|
|
||||||
@@ -102,7 +104,17 @@ class ScheduleBatchDisaggregationDecodeMixin:
|
|||||||
self.output_ids.append(req.output_ids[-1])
|
self.output_ids.append(req.output_ids[-1])
|
||||||
self.tree_cache.cache_unfinished_req(req)
|
self.tree_cache.cache_unfinished_req(req)
|
||||||
if req.grammar is not None:
|
if req.grammar is not None:
|
||||||
req.grammar.accept_token(req.output_ids[-1])
|
# FIXME: this try-except block is for handling unexpected xgrammar issue.
|
||||||
|
try:
|
||||||
|
req.grammar.accept_token(req.output_ids[-1])
|
||||||
|
except ValueError as e:
|
||||||
|
# Grammar accept_token can raise ValueError if the token is not in the grammar.
|
||||||
|
# This can happen if the grammar is not set correctly or the token is invalid.
|
||||||
|
error_message = f"Grammar accept_token failed for req {req.rid} with token {req.output_ids[-1]}: {e}"
|
||||||
|
self.tree_cache.cache_finished_req(req)
|
||||||
|
prepare_abort(
|
||||||
|
req, error_message, status_code=HTTPStatus.INTERNAL_SERVER_ERROR
|
||||||
|
)
|
||||||
req.grammar.finished = req.finished()
|
req.grammar.finished = req.finished()
|
||||||
self.output_ids = torch.tensor(self.output_ids, device=self.device)
|
self.output_ids = torch.tensor(self.output_ids, device=self.device)
|
||||||
|
|
||||||
|
|||||||
@@ -425,7 +425,19 @@ class SchedulerDisaggregationPrefillMixin:
|
|||||||
self.send_kv_chunk(req, last_chunk=True)
|
self.send_kv_chunk(req, last_chunk=True)
|
||||||
|
|
||||||
if req.grammar is not None:
|
if req.grammar is not None:
|
||||||
req.grammar.accept_token(next_token_id)
|
# FIXME: this try-except block is for handling unexpected xgrammar issue.
|
||||||
|
try:
|
||||||
|
req.grammar.accept_token(next_token_id)
|
||||||
|
except ValueError as e:
|
||||||
|
# Grammar accept_token can raise ValueError if the token is not in the grammar.
|
||||||
|
# This can happen if the grammar is not set correctly or the token is invalid.
|
||||||
|
error_message = f"Grammar accept_token failed for req {req.rid} with token {next_token_id}: {e}"
|
||||||
|
self.tree_cache.cache_finished_req(req)
|
||||||
|
prepare_abort(
|
||||||
|
req,
|
||||||
|
error_message,
|
||||||
|
status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
|
||||||
|
)
|
||||||
req.grammar.finished = req.finished()
|
req.grammar.finished = req.finished()
|
||||||
else:
|
else:
|
||||||
# being chunked reqs' prefill is not finished
|
# being chunked reqs' prefill is not finished
|
||||||
|
|||||||
Reference in New Issue
Block a user