# sglang/python/sglang/srt/managers/detokenizer_manager.py

# Copyright 2023-2024 SGLang Team
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""DetokenizerManager is a process that detokenizes the token ids."""

import dataclasses
import logging
import os
import signal
from collections import OrderedDict
from typing import Dict, List, Union

import psutil
import setproctitle
import zmq

from sglang.srt.hf_transformers_utils import get_tokenizer
from sglang.srt.managers.io_struct import (
    BatchEmbeddingOut,
    BatchMultimodalDecodeReq,
    BatchMultimodalOut,
    BatchStrOut,
    BatchTokenIDOut,
    FreezeGCReq,
    MultiTokenizerRegisterReq,
)
from sglang.srt.managers.multi_tokenizer_mixin import MultiHttpWorkerDetokenizerMixin
from sglang.srt.server_args import PortArgs, ServerArgs
from sglang.srt.utils import (
    configure_logger,
    freeze_gc,
    get_zmq_socket,
    kill_itself_when_parent_died,
)
from sglang.utils import (
    TypeBasedDispatcher,
    find_printable_text,
    get_exception_traceback,
)

logger = logging.getLogger(__name__)

# Upper bound on the number of per-request decode states the detokenizer keeps.
# Once the bound is reached, the oldest states are evicted to make room.
# Defaults to 65536 (1 << 16) and can be overridden through the
# SGLANG_DETOKENIZER_MAX_STATES environment variable.
# Background: https://github.com/sgl-project/sglang/issues/2812
# Use power of 2 values for better memory allocation.
DETOKENIZER_MAX_STATES = int(os.getenv("SGLANG_DETOKENIZER_MAX_STATES", 1 << 16))


@dataclasses.dataclass
class DecodeStatus:
    """Store the status of incremental decoding.

    One instance is kept per request id in ``DetokenizerManager.decode_status``
    and is updated by ``handle_batch_token_id_out`` for every received batch.
    """

    # Text that has already been committed from earlier decoded chunks.
    decoded_text: str
    # Token ids received so far for the request; extended with each new batch.
    decode_ids: List[int]
    # Start index into decode_ids of the window that is decoded on each step.
    surr_offset: int
    # End index of the window's prefix: decode_ids[surr_offset:read_offset] is
    # decoded separately so its text can be stripped from the window's text.
    read_offset: int
    # Length of the output text already sent downstream; only the text past
    # this offset is sent on the next incremental update.
    sent_offset: int = 0


class DetokenizerManager(MultiHttpWorkerDetokenizerMixin):
    """DetokenizerManager is a process that detokenizes the token ids."""

    def __init__(
        self,
        server_args: ServerArgs,
        port_args: PortArgs,
    ):
        """Create the IPC sockets, tokenizer, decode-state store and dispatcher.

        Args:
            server_args: Server configuration; supplies the tokenizer settings,
                ``load_format`` and ``tool_call_parser``.
            port_args: IPC endpoint names for the detokenizer and tokenizer
                sockets.
        """
        # Init inter-process communication
        zmq_context = zmq.Context(2)
        self.recv_from_scheduler = get_zmq_socket(
            zmq_context, zmq.PULL, port_args.detokenizer_ipc_name, True
        )
        self.send_to_tokenizer = get_zmq_socket(
            zmq_context, zmq.PUSH, port_args.tokenizer_ipc_name, False
        )

        # No tokenizer is loaded when tokenizer initialization is skipped.
        self.tokenizer = (
            None
            if server_args.skip_tokenizer_init
            else get_tokenizer(
                server_args.tokenizer_path,
                tokenizer_mode=server_args.tokenizer_mode,
                trust_remote_code=server_args.trust_remote_code,
                revision=server_args.revision,
            )
        )

        # Per-request incremental decoding state, bounded in size.
        self.decode_status = LimitedCapacityDict(capacity=DETOKENIZER_MAX_STATES)
        self.is_dummy = server_args.load_format == "dummy"

        # Maps each incoming message type to the method that handles it.
        handler_table = [
            (BatchEmbeddingOut, self.handle_batch_embedding_out),
            (BatchTokenIDOut, self.handle_batch_token_id_out),
            (BatchMultimodalDecodeReq, self.handle_multimodal_decode_req),
            (MultiTokenizerRegisterReq, lambda x: x),
            (FreezeGCReq, self.handle_freeze_gc_req),
        ]
        self._request_dispatcher = TypeBasedDispatcher(handler_table)

        self.is_tool_call_parser_gpt_oss = server_args.tool_call_parser == "gpt-oss"

    def event_loop(self):
        """Receive objects from the scheduler forever and forward the results.

        Each received object is passed to the request dispatcher; a handler
        result of ``None`` means there is nothing to send to the tokenizer.
        """
        while True:
            incoming = self.recv_from_scheduler.recv_pyobj()
            result = self._request_dispatcher(incoming)
            if result is None:
                continue
            self.send_to_tokenizer.send_pyobj(result)

    def trim_matched_stop(
        self, output: Union[str, List[int]], finished_reason: Dict, no_stop_trim: bool
    ):
        """Strip the matched stop string or stop token from an output.

        Args:
            output: Either decoded text or a list of token ids.
            finished_reason: Finish-reason mapping whose optional ``"matched"``
                entry holds the stop string or stop token id that was hit.
                A falsy value (``None`` or empty) means nothing is trimmed.
            no_stop_trim: When true, ``output`` is returned untouched.

        Returns:
            For text, ``output`` cut at the first occurrence of the matched
            stop string. For token ids, ``output`` without its last token.
            ``output`` unchanged whenever no trimming applies.
        """
        if no_stop_trim or not finished_reason:
            return output

        matched = finished_reason.get("matched")
        # Compare against None explicitly: token id 0 is a valid stop token and
        # must not be mistaken for "nothing matched".
        if matched is None:
            return output

        # TODO(lmzheng): handle the case where multiple stop strs are hit

        # Trim stop str.
        if isinstance(matched, str) and isinstance(output, str):
            if not matched:
                # An empty stop string would be found at position 0 and wipe
                # the whole output, so leave the text alone.
                return output
            pos = output.find(matched)
            return output[:pos] if pos != -1 else output

        # Trim stop token.
        if isinstance(matched, int) and isinstance(output, list):
            if not output:
                # Nothing to trim; also keeps the index access below safe.
                return output
            # 200012 <|call|> is the tool call token and one of eos tokens for gpt-oss model
            if output[-1] == 200012 and self.is_tool_call_parser_gpt_oss:
                return output
            return output[:-1]
        return output

    def handle_batch_embedding_out(self, recv_obj: BatchEmbeddingOut):
        # If it is embedding model, no detokenization is needed.
        return recv_obj

    def handle_batch_token_id_out(self, recv_obj: BatchTokenIDOut):
        """Incrementally detokenize a batch of token ids into text.

        A ``DecodeStatus`` is kept per request id in ``self.decode_status``.
        For every request the window ``decode_ids[surr_offset:]`` and its
        prefix ``decode_ids[surr_offset:read_offset]`` are both decoded; the
        text of the window beyond the prefix text is the newly produced text.
        Only the part of the resulting output that has not been sent yet
        (past ``sent_offset``) is placed in the returned batch.

        Args:
            recv_obj: Batch of token ids and metadata. Its per-request list
                fields are indexed by position in ``recv_obj.rids``.

        Returns:
            A ``BatchStrOut`` carrying the incremental output strings, with the
            remaining fields copied from ``recv_obj``.

        Raises:
            RuntimeError: If the decode status of a request in the batch is no
                longer present in ``self.decode_status``.
        """
        rids = recv_obj.rids

        # Initialize decode status
        read_ids, surr_ids = [], []
        for i, rid in enumerate(rids):
            if rid not in self.decode_status:
                s = DecodeStatus(
                    decoded_text=recv_obj.decoded_texts[i],
                    decode_ids=recv_obj.decode_ids[i],
                    surr_offset=0,
                    read_offset=recv_obj.read_offsets[i],
                )
                self.decode_status[rid] = s
            else:
                s = self.decode_status[rid]
                s.decode_ids.extend(recv_obj.decode_ids[i])

            read_ids.append(
                self.trim_matched_stop(
                    s.decode_ids[s.surr_offset :],
                    recv_obj.finished_reasons[i],
                    recv_obj.no_stop_trim[i],
                )
            )
            surr_ids.append(s.decode_ids[s.surr_offset : s.read_offset])

        # TODO(lmzheng): handle skip_special_tokens/spaces_between_special_tokens per request
        # Until then, the first request's settings are applied to the whole batch.
        decode_kwargs = dict(
            skip_special_tokens=recv_obj.skip_special_tokens[0],
            spaces_between_special_tokens=recv_obj.spaces_between_special_tokens[0],
        )
        surr_texts = self.tokenizer.batch_decode(surr_ids, **decode_kwargs)
        read_texts = self.tokenizer.batch_decode(read_ids, **decode_kwargs)

        # Incremental decoding
        output_strs = []
        for i, rid in enumerate(rids):
            try:
                s = self.decode_status[rid]
            except KeyError:
                raise RuntimeError(
                    f"Decode status not found for request {rid}. "
                    "It may be due to the request being evicted from the decode status due to memory pressure. "
                    "Please increase the maximum number of requests by setting "
                    "the SGLANG_DETOKENIZER_MAX_STATES environment variable to a bigger value than the default value. "
                    f"The current value is {DETOKENIZER_MAX_STATES}. "
                    "For more details, see: https://github.com/sgl-project/sglang/issues/2812"
                )
            finished_reason = recv_obj.finished_reasons[i]
            # Text of the window beyond the already-known prefix text.
            new_text = read_texts[i][len(surr_texts[i]) :]
            if finished_reason is None:
                # Streaming chunk: update the decode status
                # A trailing U+FFFD (Unicode replacement character) presumably
                # means the last tokens do not yet form complete text, so such
                # a tail is not committed to decoded_text.
                if new_text and not new_text.endswith("\ufffd"):
                    s.decoded_text = s.decoded_text + new_text
                    s.surr_offset = s.read_offset
                    s.read_offset = len(s.decode_ids)
                    new_text = ""
                else:
                    # Not committed: expose only what find_printable_text returns.
                    new_text = find_printable_text(new_text)

            output_str = self.trim_matched_stop(
                s.decoded_text + new_text,
                finished_reason,
                recv_obj.no_stop_trim[i],
            )
            # Incrementally send text.
            incremental_output = output_str[s.sent_offset :]
            s.sent_offset = len(output_str)
            output_strs.append(incremental_output)

        return BatchStrOut(
            rids=recv_obj.rids,
            finished_reasons=recv_obj.finished_reasons,
            output_strs=output_strs,
            output_ids=recv_obj.decode_ids,
            prompt_tokens=recv_obj.prompt_tokens,
            completion_tokens=recv_obj.completion_tokens,
            cached_tokens=recv_obj.cached_tokens,
            spec_verify_ct=recv_obj.spec_verify_ct,
            input_token_logprobs_val=recv_obj.input_token_logprobs_val,
            input_token_logprobs_idx=recv_obj.input_token_logprobs_idx,
            output_token_logprobs_val=recv_obj.output_token_logprobs_val,
            output_token_logprobs_idx=recv_obj.output_token_logprobs_idx,
            input_top_logprobs_val=recv_obj.input_top_logprobs_val,
            input_top_logprobs_idx=recv_obj.input_top_logprobs_idx,
            output_top_logprobs_val=recv_obj.output_top_logprobs_val,
            output_top_logprobs_idx=recv_obj.output_top_logprobs_idx,
            input_token_ids_logprobs_val=recv_obj.input_token_ids_logprobs_val,
            input_token_ids_logprobs_idx=recv_obj.input_token_ids_logprobs_idx,
            output_token_ids_logprobs_val=recv_obj.output_token_ids_logprobs_val,
            output_token_ids_logprobs_idx=recv_obj.output_token_ids_logprobs_idx,
            output_hidden_states=recv_obj.output_hidden_states,
            placeholder_tokens_idx=None,
            placeholder_tokens_val=None,
        )

    def handle_multimodal_decode_req(self, recv_obj: BatchMultimodalDecodeReq):
        """Detokenize a multimodal decode request into a ``BatchMultimodalOut``.

        The tokenizer's ``detokenize`` result becomes ``outputs``; the request
        ids, finish reasons and token counters are copied from ``recv_obj``.
        """
        decoded = self.tokenizer.detokenize(recv_obj)
        # Fields forwarded verbatim from the request to the response.
        forwarded = ("rids", "finished_reasons")
        counters = ("prompt_tokens", "completion_tokens", "cached_tokens")
        return BatchMultimodalOut(
            **{field: getattr(recv_obj, field) for field in forwarded},
            outputs=decoded,
            **{field: getattr(recv_obj, field) for field in counters},
            placeholder_tokens_idx=None,
            placeholder_tokens_val=None,
        )

    def handle_freeze_gc_req(self, recv_req: FreezeGCReq):
        """Call ``freeze_gc`` for the detokenizer process.

        Implicitly returns ``None``, so the event loop sends nothing to the
        tokenizer for this request.
        """
        freeze_gc("Detokenizer Manager")


class LimitedCapacityDict(OrderedDict):
    """An ``OrderedDict`` that holds at most ``capacity`` entries.

    Inserting a new key while the dict is full evicts the oldest inserted
    entry first. Assigning to a key that is already present updates its value
    in place: nothing is evicted and the key keeps its position.

    Args:
        capacity: Maximum number of entries to keep.
        *args: Forwarded to ``OrderedDict`` (for example initial items).
        **kwargs: Forwarded to ``OrderedDict``.
    """

    def __init__(self, capacity: int, *args, **kwargs):
        # Set the capacity before delegating: OrderedDict.__init__ inserts any
        # initial items through __setitem__, which reads self.capacity.
        self.capacity = capacity
        super().__init__(*args, **kwargs)

    def __setitem__(self, key, value):
        # Only a brand-new key grows the dict, so only then can an eviction be
        # needed; updating an existing key must not drop an unrelated entry.
        if key not in self and len(self) >= self.capacity:
            # Remove the oldest element (first item in the dict)
            self.popitem(last=False)
        # Set the new item
        super().__setitem__(key, value)


def run_detokenizer_process(
    server_args: ServerArgs,
    port_args: PortArgs,
):
    """Entry point of the detokenizer process.

    Sets the process title, configures logging, builds a
    ``DetokenizerManager`` and runs its event loop (the multi-HTTP-worker loop
    when ``server_args.tokenizer_worker_num > 1``). If anything raises, the
    traceback is logged and ``SIGQUIT`` is sent to the parent process.

    Args:
        server_args: Server configuration passed to the manager and logger.
        port_args: IPC endpoint names passed to the manager.
    """
    kill_itself_when_parent_died()
    setproctitle.setproctitle("sglang::detokenizer")
    configure_logger(server_args)
    parent_process = psutil.Process().parent()

    # Bound before the try block so the handler can tell whether construction
    # of the manager itself was what failed.
    manager = None
    try:
        manager = DetokenizerManager(server_args, port_args)
        if server_args.tokenizer_worker_num > 1:
            manager.multi_http_worker_event_loop()
        else:
            manager.event_loop()
    except Exception:
        # Capture the traceback first so that a failure during cleanup cannot
        # replace or hide the original error.
        traceback = get_exception_traceback()
        # The manager may not exist, and socket_mapping is not set by
        # DetokenizerManager.__init__, so look it up defensively.
        socket_mapping = getattr(manager, "socket_mapping", None)
        if socket_mapping is not None:
            try:
                socket_mapping.clear_all_sockets()
            except Exception:
                logger.exception("Failed to clear the detokenizer socket mapping")
        logger.error(f"DetokenizerManager hit an exception: {traceback}")
        parent_process.send_signal(signal.SIGQUIT)
-												docs: fix module docstrings and copyright headers (#2077)


											
										
										
											2024-11-22 22:16:53 +08:00
+								# Copyright 2023-2024 SGLang Team
 								# Licensed under the Apache License, Version 2.0 (the "License");
 								# you may not use this file except in compliance with the License.
 								# You may obtain a copy of the License at
 								#
 								#     http://www.apache.org/licenses/LICENSE-2.0
 								#
 								# Unless required by applicable law or agreed to in writing, software
 								# distributed under the License is distributed on an "AS IS" BASIS,
 								# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 								# See the License for the specific language governing permissions and
 								# limitations under the License.
 								# ==============================================================================
-												Improve doc strings (#518)

											
										
										
											2024-06-08 02:06:52 -07:00
+								"""DetokenizerManager is a process that detokenizes the token ids."""
-												Higher priority for user input of max_prefill_tokens & format (#540)


											
										
										
											2024-06-12 21:48:40 -07:00
-												Detokenize incrementally when streaming (#653)


											
										
										
											2024-07-18 17:57:40 -07:00
+								import dataclasses
-												Improve process creation (#1534)


											
										
										
											2024-09-29 02:36:12 -07:00
+								import logging
-												[#2812] Make the decode status dict capcity adjustable by a CLI param (#2839)


											
										
										
											2025-01-20 04:36:53 +09:00
+								import os
-												Crash the server correctly during error (#2231)


											
										
										
											2024-11-28 00:22:39 -08:00
+								import signal
-												Improve process creation (#1534)


											
										
										
											2024-09-29 02:36:12 -07:00
+								from collections import OrderedDict
-												Simplify stream_output (#2398)


											
										
										
											2024-12-08 12:27:13 -08:00
+								from typing import Dict, List, Union
-												release initial code

Co-authored-by: Ying Sheng <sqy1415@gmail.com>
Co-authored-by: Liangsheng Yin <hnyls2002@gmail.com>
Co-authored-by: Zhiqiang Xie <xiezhq@stanford.edu>
Co-authored-by: parasol-aser <3848358+parasol-aser@users.noreply.github.com>
Co-authored-by: LiviaSun <33578456+ChuyueSun@users.noreply.github.com>
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>

											
										
										
											2024-01-08 04:37:50 +00:00
-												Crash the server correctly during error (#2231)


											
										
										
											2024-11-28 00:22:39 -08:00
+								import psutil
-												nit: Remove busy waiting on scheduler (#2382)


											
										
										
											2024-12-08 01:06:15 -08:00
+								import setproctitle
-												release initial code

Co-authored-by: Ying Sheng <sqy1415@gmail.com>
Co-authored-by: Liangsheng Yin <hnyls2002@gmail.com>
Co-authored-by: Zhiqiang Xie <xiezhq@stanford.edu>
Co-authored-by: parasol-aser <3848358+parasol-aser@users.noreply.github.com>
Co-authored-by: LiviaSun <33578456+ChuyueSun@users.noreply.github.com>
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>

											
										
										
											2024-01-08 04:37:50 +00:00
+								import zmq
-												add `.isort.cfg` (#378)


											
										
										
											2024-04-22 22:38:09 +08:00
-												release initial code

Co-authored-by: Ying Sheng <sqy1415@gmail.com>
Co-authored-by: Liangsheng Yin <hnyls2002@gmail.com>
Co-authored-by: Zhiqiang Xie <xiezhq@stanford.edu>
Co-authored-by: parasol-aser <3848358+parasol-aser@users.noreply.github.com>
Co-authored-by: LiviaSun <33578456+ChuyueSun@users.noreply.github.com>
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>

											
										
										
											2024-01-08 04:37:50 +00:00
+								from sglang.srt.hf_transformers_utils import get_tokenizer
-												Add e5-mistral embedding model - step 3/3 (#988)


											
										
										
											2024-08-08 16:31:19 -07:00
+								from sglang.srt.managers.io_struct import (
 								    BatchEmbeddingOut,
-												Support penalty in overlap mode; return logprob with chunked prefill; improve benchmark scripts (#3988)

Co-authored-by: SangBin Cho <rkooo567@gmail.com>
Co-authored-by: dhou-xai <dhou@x.ai>
Co-authored-by: Hanming Lu <hanming_lu@berkeley.edu>

											
										
										
											2025-03-03 00:12:04 -08:00
+								    BatchMultimodalDecodeReq,
-												Support incremental streaming of logprob/token_ids between scheduler and detokenizer (#6225)

Co-authored-by: SangBin Cho <rkooo567@gmail.com>
											
										
										
											2025-05-12 14:33:38 -07:00
+								    BatchMultimodalOut,
-												Add e5-mistral embedding model - step 3/3 (#988)


											
										
										
											2024-08-08 16:31:19 -07:00
+								    BatchStrOut,
 								    BatchTokenIDOut,
-												Support GC Freezing to improve latency & throughput (#9241)

Co-authored-by: Chanh Nguyen <cnguyen@linkedin.com>
Co-authored-by: Liangsheng Yin <hnyls2002@gmail.com>
											
										
										
											2025-08-22 22:43:09 -07:00
+								    FreezeGCReq,
-												Support Multi Process Tokenizer Manager(#6555) (#8964)

Signed-off-by: ybyang <ybyang7@iflytek.com>
Signed-off-by: huanglong <huanglong@linux.alibaba.com>
Co-authored-by: Huang Long <121648372+LLLL114@users.noreply.github.com>
Co-authored-by: huanglong <huanglong@linux.alibaba.com>
Co-authored-by: Shangming Cai <csmthu@gmail.com>
											
										
										
											2025-09-01 16:00:13 +08:00
+								    MultiTokenizerRegisterReq,
-												Add e5-mistral embedding model - step 3/3 (#988)


											
										
										
											2024-08-08 16:31:19 -07:00
+								)
-												[1/2] Refactor multi-tokenizer manager (#10074)


											
										
										
											2025-09-07 19:13:34 +08:00
+								from sglang.srt.managers.multi_tokenizer_mixin import MultiHttpWorkerDetokenizerMixin
-												release initial code

Co-authored-by: Ying Sheng <sqy1415@gmail.com>
Co-authored-by: Liangsheng Yin <hnyls2002@gmail.com>
Co-authored-by: Zhiqiang Xie <xiezhq@stanford.edu>
Co-authored-by: parasol-aser <3848358+parasol-aser@users.noreply.github.com>
Co-authored-by: LiviaSun <33578456+ChuyueSun@users.noreply.github.com>
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>

											
										
										
											2024-01-08 04:37:50 +00:00
+								from sglang.srt.server_args import PortArgs, ServerArgs
-												Support penalty in overlap mode; return logprob with chunked prefill; improve benchmark scripts (#3988)

Co-authored-by: SangBin Cho <rkooo567@gmail.com>
Co-authored-by: dhou-xai <dhou@x.ai>
Co-authored-by: Hanming Lu <hanming_lu@berkeley.edu>

											
										
										
											2025-03-03 00:12:04 -08:00
+								from sglang.srt.utils import (
 								    configure_logger,
-												Support GC Freezing to improve latency & throughput (#9241)

Co-authored-by: Chanh Nguyen <cnguyen@linkedin.com>
Co-authored-by: Liangsheng Yin <hnyls2002@gmail.com>
											
										
										
											2025-08-22 22:43:09 -07:00
+								    freeze_gc,
-												Support penalty in overlap mode; return logprob with chunked prefill; improve benchmark scripts (#3988)

Co-authored-by: SangBin Cho <rkooo567@gmail.com>
Co-authored-by: dhou-xai <dhou@x.ai>
Co-authored-by: Hanming Lu <hanming_lu@berkeley.edu>

											
										
										
											2025-03-03 00:12:04 -08:00
+								    get_zmq_socket,
 								    kill_itself_when_parent_died,
 								)
-												Improve: Use TypeBasedDispatcher in DetokenizerManager (#3117)


											
										
										
											2025-02-22 11:50:46 +08:00
+								from sglang.utils import (
 								    TypeBasedDispatcher,
 								    find_printable_text,
 								    get_exception_traceback,
 								)
-												release initial code

Co-authored-by: Ying Sheng <sqy1415@gmail.com>
Co-authored-by: Liangsheng Yin <hnyls2002@gmail.com>
Co-authored-by: Zhiqiang Xie <xiezhq@stanford.edu>
Co-authored-by: parasol-aser <3848358+parasol-aser@users.noreply.github.com>
Co-authored-by: LiviaSun <33578456+ChuyueSun@users.noreply.github.com>
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>

											
										
										
											2024-01-08 04:37:50 +00:00
-												Improve process creation (#1534)


											
										
										
											2024-09-29 02:36:12 -07:00
+								logger = logging.getLogger(__name__)
-												[#2812] Make the decode status dict capcity adjustable by a CLI param (#2839)


											
										
										
											2025-01-20 04:36:53 +09:00
+								# Maximum number of request states that detokenizer can hold. When exceeded,
 								# oldest request states will be evicted. Default: 65536 (1<<16).
 								# For more details, see: https://github.com/sgl-project/sglang/issues/2812
 								# Use power of 2 values for better memory allocation.
 								DETOKENIZER_MAX_STATES = int(os.environ.get("SGLANG_DETOKENIZER_MAX_STATES", 1 << 16))
-												release initial code

Co-authored-by: Ying Sheng <sqy1415@gmail.com>
Co-authored-by: Liangsheng Yin <hnyls2002@gmail.com>
Co-authored-by: Zhiqiang Xie <xiezhq@stanford.edu>
Co-authored-by: parasol-aser <3848358+parasol-aser@users.noreply.github.com>
Co-authored-by: LiviaSun <33578456+ChuyueSun@users.noreply.github.com>
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>

											
										
										
											2024-01-08 04:37:50 +00:00
-												Detokenize incrementally when streaming (#653)


											
										
										
											2024-07-18 17:57:40 -07:00
+								@dataclasses.dataclass
 								class DecodeStatus:
-												[Minor] Improve logging and rename the health check endpoint name (#1180)


											
										
										
											2024-08-21 19:24:36 -07:00
+								    """Store the status of incremental decoding."""
-												Detokenize incrementally when streaming (#653)


											
										
										
											2024-07-18 17:57:40 -07:00
+								    decoded_text: str
 								    decode_ids: List[int]
 								    surr_offset: int
 								    read_offset: int
-												Support incremental streaming of logprob/token_ids between scheduler and detokenizer (#6225)

Co-authored-by: SangBin Cho <rkooo567@gmail.com>
											
										
										
											2025-05-12 14:33:38 -07:00
+								    # Offset that's sent to tokenizer for incremental update.
 								    sent_offset: int = 0
-												Detokenize incrementally when streaming (#653)


											
										
										
											2024-07-18 17:57:40 -07:00
-												[1/2] Refactor multi-tokenizer manager (#10074)


											
										
										
											2025-09-07 19:13:34 +08:00
+								class DetokenizerManager(MultiHttpWorkerDetokenizerMixin):
-												[Minor] Improve logging and rename the health check endpoint name (#1180)


											
										
										
											2024-08-21 19:24:36 -07:00
+								    """DetokenizerManager is a process that detokenizes the token ids."""
-												release initial code

Co-authored-by: Ying Sheng <sqy1415@gmail.com>
Co-authored-by: Liangsheng Yin <hnyls2002@gmail.com>
Co-authored-by: Zhiqiang Xie <xiezhq@stanford.edu>
Co-authored-by: parasol-aser <3848358+parasol-aser@users.noreply.github.com>
Co-authored-by: LiviaSun <33578456+ChuyueSun@users.noreply.github.com>
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>

											
										
										
											2024-01-08 04:37:50 +00:00
+								    def __init__(
 								        self,
 								        server_args: ServerArgs,
 								        port_args: PortArgs,
 								    ):
-												[Minor] Improve the function organization in TokenizerManager & improve loggers (#1208)


											
										
										
											2024-08-25 14:46:34 -07:00
+								        # Init inter-process communication
-												Make detokenizer_manager.py not asyncio (#1532)


											
										
										
											2024-09-28 19:33:09 -07:00
+								        context = zmq.Context(2)
-												Fix possible ZMQ hanging (#1800)


											
										
										
											2024-10-25 23:07:07 -07:00
+								        self.recv_from_scheduler = get_zmq_socket(
-												Fix zmq binding (#2930)

Co-authored-by: Chunyuan WU <chunyuan.wu@intel.com>
											
										
										
											2025-01-16 14:36:07 -08:00
+								            context, zmq.PULL, port_args.detokenizer_ipc_name, True
-												Fix possible ZMQ hanging (#1800)


											
										
										
											2024-10-25 23:07:07 -07:00
+								        )
 								        self.send_to_tokenizer = get_zmq_socket(
-												Fix zmq binding (#2930)

Co-authored-by: Chunyuan WU <chunyuan.wu@intel.com>
											
										
										
											2025-01-16 14:36:07 -08:00
+								            context, zmq.PUSH, port_args.tokenizer_ipc_name, False
-												Fix possible ZMQ hanging (#1800)


											
										
										
											2024-10-25 23:07:07 -07:00
+								        )
-												Revert "Support Multi Process Tokenizer Manager" (#8960)


											
										
										
											2025-08-08 02:28:27 -07:00
-												Add skip_tokenizer_init args. (#959)

Co-authored-by: lzhang <zhanglei@modelbest.cn>
											
										
										
											2024-08-10 03:14:13 +08:00
+								        if server_args.skip_tokenizer_init:
 								            self.tokenizer = None
 								        else:
 								            self.tokenizer = get_tokenizer(
 								                server_args.tokenizer_path,
 								                tokenizer_mode=server_args.tokenizer_mode,
 								                trust_remote_code=server_args.trust_remote_code,
-												fix missing revision arg when loading tokenizer (#2982)


											
										
										
											2025-01-19 20:36:07 +01:00
+								                revision=server_args.revision,
-												Add skip_tokenizer_init args. (#959)

Co-authored-by: lzhang <zhanglei@modelbest.cn>
											
										
										
											2024-08-10 03:14:13 +08:00
+								            )
-												release initial code

Co-authored-by: Ying Sheng <sqy1415@gmail.com>
Co-authored-by: Liangsheng Yin <hnyls2002@gmail.com>
Co-authored-by: Zhiqiang Xie <xiezhq@stanford.edu>
Co-authored-by: parasol-aser <3848358+parasol-aser@users.noreply.github.com>
Co-authored-by: LiviaSun <33578456+ChuyueSun@users.noreply.github.com>
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>

											
										
										
											2024-01-08 04:37:50 +00:00
-												[#2812] Make the decode status dict capcity adjustable by a CLI param (#2839)


											
										
										
											2025-01-20 04:36:53 +09:00
+								        self.decode_status = LimitedCapacityDict(capacity=DETOKENIZER_MAX_STATES)
-												Support penalty in overlap mode; return logprob with chunked prefill; improve benchmark scripts (#3988)

Co-authored-by: SangBin Cho <rkooo567@gmail.com>
Co-authored-by: dhou-xai <dhou@x.ai>
Co-authored-by: Hanming Lu <hanming_lu@berkeley.edu>

											
										
										
											2025-03-03 00:12:04 -08:00
+								        self.is_dummy = server_args.load_format == "dummy"
-												Revert "Support Multi Process Tokenizer Manager" (#8960)


											
										
										
											2025-08-08 02:28:27 -07:00
-												Improve: Use TypeBasedDispatcher in DetokenizerManager (#3117)


											
										
										
											2025-02-22 11:50:46 +08:00
+								        self._request_dispatcher = TypeBasedDispatcher(
 								            [
 								                (BatchEmbeddingOut, self.handle_batch_embedding_out),
 								                (BatchTokenIDOut, self.handle_batch_token_id_out),
-												Support penalty in overlap mode; return logprob with chunked prefill; improve benchmark scripts (#3988)

Co-authored-by: SangBin Cho <rkooo567@gmail.com>
Co-authored-by: dhou-xai <dhou@x.ai>
Co-authored-by: Hanming Lu <hanming_lu@berkeley.edu>

											
										
										
											2025-03-03 00:12:04 -08:00
+								                (BatchMultimodalDecodeReq, self.handle_multimodal_decode_req),
-												Support Multi Process Tokenizer Manager(#6555) (#8964)

Signed-off-by: ybyang <ybyang7@iflytek.com>
Signed-off-by: huanglong <huanglong@linux.alibaba.com>
Co-authored-by: Huang Long <121648372+LLLL114@users.noreply.github.com>
Co-authored-by: huanglong <huanglong@linux.alibaba.com>
Co-authored-by: Shangming Cai <csmthu@gmail.com>
											
										
										
											2025-09-01 16:00:13 +08:00
+								                (MultiTokenizerRegisterReq, lambda x: x),
-												Support GC Freezing to improve latency & throughput (#9241)

Co-authored-by: Chanh Nguyen <cnguyen@linkedin.com>
Co-authored-by: Liangsheng Yin <hnyls2002@gmail.com>
											
										
										
											2025-08-22 22:43:09 -07:00
+								                (FreezeGCReq, self.handle_freeze_gc_req),
-												Improve: Use TypeBasedDispatcher in DetokenizerManager (#3117)


											
										
										
											2025-02-22 11:50:46 +08:00
+								            ]
 								        )
-												Fix Harmony reasoning parser for and auto-separation for gpt-oss models (#9190)

Co-authored-by: Chang Su <chang.s.su@oracle.com>
Co-authored-by: Chayenne <zhaochen20@outlook.com>
Co-authored-by: zhaochenyang20 <zhaochenyang20@gmail.com>
Co-authored-by: minleminzui <2969413251@qq.com>
Co-authored-by: maocheng23 <maocheng@berkeley.edu>
Co-authored-by: Xinyuan Tong <xinyuantong.cs@gmail.com>
											
										
										
											2025-08-26 00:26:26 +02:00
+								        self.is_tool_call_parser_gpt_oss = server_args.tool_call_parser == "gpt-oss"
-												Support penalty in overlap mode; return logprob with chunked prefill; improve benchmark scripts (#3988)

Co-authored-by: SangBin Cho <rkooo567@gmail.com>
Co-authored-by: dhou-xai <dhou@x.ai>
Co-authored-by: Hanming Lu <hanming_lu@berkeley.edu>

											
										
										
											2025-03-03 00:12:04 -08:00
+								    def event_loop(self):
 								        """The event loop that handles requests"""
 								        while True:
-												Revert "Support Multi Process Tokenizer Manager" (#8960)


											
										
										
											2025-08-08 02:28:27 -07:00
+								            recv_obj = self.recv_from_scheduler.recv_pyobj()
 								            output = self._request_dispatcher(recv_obj)
-												Support GC Freezing to improve latency & throughput (#9241)

Co-authored-by: Chanh Nguyen <cnguyen@linkedin.com>
Co-authored-by: Liangsheng Yin <hnyls2002@gmail.com>
											
										
										
											2025-08-22 22:43:09 -07:00
+								            if output is not None:
 								                self.send_to_tokenizer.send_pyobj(output)
-												Support penalty in overlap mode; return logprob with chunked prefill; improve benchmark scripts (#3988)

Co-authored-by: SangBin Cho <rkooo567@gmail.com>
Co-authored-by: dhou-xai <dhou@x.ai>
Co-authored-by: Hanming Lu <hanming_lu@berkeley.edu>

											
										
										
											2025-03-03 00:12:04 -08:00
-												Simplify stream_output (#2398)


											
										
										
											2024-12-08 12:27:13 -08:00
+								    def trim_matched_stop(
 								        self, output: Union[str, List[int]], finished_reason: Dict, no_stop_trim: bool
 								    ):
 								        if no_stop_trim or not finished_reason:
 								            return output
 								        matched = finished_reason.get("matched", None)
 								        if not matched:
-												[Fix] fix eos trim inconsistency (#1650)


											
										
										
											2024-10-13 01:07:09 -07:00
+								            return output
-												Simplify stream_output (#2398)


											
										
										
											2024-12-08 12:27:13 -08:00
+								        # TODO(lmzheng): handle the case where multiple stop strs are hit
 								        # Trim stop str.
 								        if isinstance(matched, str) and isinstance(output, str):
 								            pos = output.find(matched)
-												[Fix] fix eos trim inconsistency (#1650)


											
										
										
											2024-10-13 01:07:09 -07:00
+								            return output[:pos] if pos != -1 else output
-												Simplify stream_output (#2398)


											
										
										
											2024-12-08 12:27:13 -08:00
 								        # Trim stop token.
 								        if isinstance(matched, int) and isinstance(output, list):
-												Fix Harmony reasoning parser for and auto-separation for gpt-oss models (#9190)

Co-authored-by: Chang Su <chang.s.su@oracle.com>
Co-authored-by: Chayenne <zhaochen20@outlook.com>
Co-authored-by: zhaochenyang20 <zhaochenyang20@gmail.com>
Co-authored-by: minleminzui <2969413251@qq.com>
Co-authored-by: maocheng23 <maocheng@berkeley.edu>
Co-authored-by: Xinyuan Tong <xinyuantong.cs@gmail.com>
											
										
										
											2025-08-26 00:26:26 +02:00
+								            # 200012 <|call|> is the tool call token and one of eos tokens for gpt-oss model
 								            if output[-1] == 200012 and self.is_tool_call_parser_gpt_oss:
 								                return output
-												[Fix] fix eos trim inconsistency (#1650)


											
										
										
											2024-10-13 01:07:09 -07:00
+								            assert len(output) > 0
 								            return output[:-1]
 								        return output
-												Improve: Use TypeBasedDispatcher in DetokenizerManager (#3117)


											
										
										
											2025-02-22 11:50:46 +08:00
+								    def handle_batch_embedding_out(self, recv_obj: BatchEmbeddingOut):
 								        # If it is embedding model, no detokenization is needed.
 								        return recv_obj
 								    def handle_batch_token_id_out(self, recv_obj: BatchTokenIDOut):
 								        bs = len(recv_obj.rids)
 								        # Initialize decode status
 								        read_ids, surr_ids = [], []
 								        for i in range(bs):
 								            rid = recv_obj.rids[i]
-												Misc clean up; Remove the support of jump forward (#4032)


											
										
										
											2025-03-03 07:02:14 -08:00
+								            if rid not in self.decode_status:
-												Improve: Use TypeBasedDispatcher in DetokenizerManager (#3117)


											
										
										
											2025-02-22 11:50:46 +08:00
+								                s = DecodeStatus(
 								                    decoded_text=recv_obj.decoded_texts[i],
 								                    decode_ids=recv_obj.decode_ids[i],
 								                    surr_offset=0,
 								                    read_offset=recv_obj.read_offsets[i],
 								                )
 								                self.decode_status[rid] = s
-												[Fix] Fix --skip-tokenizer-init (#1798)


											
										
										
											2024-10-25 18:51:59 -07:00
+								            else:
-												Improve: Use TypeBasedDispatcher in DetokenizerManager (#3117)


											
										
										
											2025-02-22 11:50:46 +08:00
+								                s = self.decode_status[rid]
-												Support incremental streaming of logprob/token_ids between scheduler and detokenizer (#6225)

Co-authored-by: SangBin Cho <rkooo567@gmail.com>
											
										
										
											2025-05-12 14:33:38 -07:00
+								                s.decode_ids.extend(recv_obj.decode_ids[i])
-												Improve: Use TypeBasedDispatcher in DetokenizerManager (#3117)


											
										
										
											2025-02-22 11:50:46 +08:00
 								            read_ids.append(
 								                self.trim_matched_stop(
 								                    s.decode_ids[s.surr_offset :],
 								                    recv_obj.finished_reasons[i],
 								                    recv_obj.no_stop_trim[i],
-												[Fix] fix eos trim inconsistency (#1650)


											
										
										
											2024-10-13 01:07:09 -07:00
+								                )
-												Fix rid state map leak + Refractor .finished (#505)

Co-authored-by: ZX <zx@lbx.dev>
											
										
										
											2024-06-08 04:20:40 +08:00
+								            )
-												Improve: Use TypeBasedDispatcher in DetokenizerManager (#3117)


											
										
										
											2025-02-22 11:50:46 +08:00
+								            surr_ids.append(s.decode_ids[s.surr_offset : s.read_offset])
-												Fix rid state map leak + Refractor .finished (#505)

Co-authored-by: ZX <zx@lbx.dev>
											
										
										
											2024-06-08 04:20:40 +08:00
-												Improve: Use TypeBasedDispatcher in DetokenizerManager (#3117)


											
										
										
											2025-02-22 11:50:46 +08:00
+								        # TODO(lmzheng): handle skip_special_tokens/spaces_between_special_tokens per request
 								        surr_texts = self.tokenizer.batch_decode(
 								            surr_ids,
 								            skip_special_tokens=recv_obj.skip_special_tokens[0],
 								            spaces_between_special_tokens=recv_obj.spaces_between_special_tokens[0],
 								        )
 								        read_texts = self.tokenizer.batch_decode(
 								            read_ids,
 								            skip_special_tokens=recv_obj.skip_special_tokens[0],
 								            spaces_between_special_tokens=recv_obj.spaces_between_special_tokens[0],
 								        )
 								        # Incremental decoding
 								        output_strs = []
 								        for i in range(bs):
 								            try:
 								                s = self.decode_status[recv_obj.rids[i]]
 								            except KeyError:
 								                raise RuntimeError(
 								                    f"Decode status not found for request {recv_obj.rids[i]}. "
 								                    "It may be due to the request being evicted from the decode status due to memory pressure. "
 								                    "Please increase the maximum number of requests by setting "
 								                    "the SGLANG_DETOKENIZER_MAX_STATES environment variable to a bigger value than the default value. "
 								                    f"The current value is {DETOKENIZER_MAX_STATES}. "
 								                    "For more details, see: https://github.com/sgl-project/sglang/issues/2812"
-												[Fix] fix eos trim inconsistency (#1650)


											
										
										
											2024-10-13 01:07:09 -07:00
+								                )
-												Improve: Use TypeBasedDispatcher in DetokenizerManager (#3117)


											
										
										
											2025-02-22 11:50:46 +08:00
+								            new_text = read_texts[i][len(surr_texts[i]) :]
 								            if recv_obj.finished_reasons[i] is None:
 								                # Streaming chunk: update the decode status
 								                if len(new_text) > 0 and not new_text.endswith("<EFBFBD>"):
 								                    s.decoded_text = s.decoded_text + new_text
 								                    s.surr_offset = s.read_offset
 								                    s.read_offset = len(s.decode_ids)
 								                    new_text = ""
 								                else:
 								                    new_text = find_printable_text(new_text)
-												Fix rid state map leak + Refractor .finished (#505)

Co-authored-by: ZX <zx@lbx.dev>
											
										
										
											2024-06-08 04:20:40 +08:00
-												Support incremental streaming of logprob/token_ids between scheduler and detokenizer (#6225)

Co-authored-by: SangBin Cho <rkooo567@gmail.com>
											
										
										
											2025-05-12 14:33:38 -07:00
+								            output_str = self.trim_matched_stop(
 								                s.decoded_text + new_text,
 								                recv_obj.finished_reasons[i],
 								                recv_obj.no_stop_trim[i],
-												Fix rid state map leak + Refractor .finished (#505)

Co-authored-by: ZX <zx@lbx.dev>
											
										
										
											2024-06-08 04:20:40 +08:00
+								            )
-												Support incremental streaming of logprob/token_ids between scheduler and detokenizer (#6225)

Co-authored-by: SangBin Cho <rkooo567@gmail.com>
											
										
										
											2025-05-12 14:33:38 -07:00
+								            # Incrementally send text.
 								            incremental_output = output_str[s.sent_offset :]
 								            s.sent_offset = len(output_str)
 								            output_strs.append(incremental_output)
-												release initial code

Co-authored-by: Ying Sheng <sqy1415@gmail.com>
Co-authored-by: Liangsheng Yin <hnyls2002@gmail.com>
Co-authored-by: Zhiqiang Xie <xiezhq@stanford.edu>
Co-authored-by: parasol-aser <3848358+parasol-aser@users.noreply.github.com>
Co-authored-by: LiviaSun <33578456+ChuyueSun@users.noreply.github.com>
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>

											
										
										
											2024-01-08 04:37:50 +00:00
-												Support penalty in overlap mode; return logprob with chunked prefill; improve benchmark scripts (#3988)

Co-authored-by: SangBin Cho <rkooo567@gmail.com>
Co-authored-by: dhou-xai <dhou@x.ai>
Co-authored-by: Hanming Lu <hanming_lu@berkeley.edu>

											
										
										
											2025-03-03 00:12:04 -08:00
+								        return BatchStrOut(
-												Improve: Use TypeBasedDispatcher in DetokenizerManager (#3117)


											
										
										
											2025-02-22 11:50:46 +08:00
+								            rids=recv_obj.rids,
 								            finished_reasons=recv_obj.finished_reasons,
 								            output_strs=output_strs,
-												Revert "bugfix: Fix output_ids extraction in detokenizer_manager" (#9467)


											
										
										
											2025-08-22 08:24:25 +08:00
+								            output_ids=recv_obj.decode_ids,
-												Improve: Use TypeBasedDispatcher in DetokenizerManager (#3117)


											
										
										
											2025-02-22 11:50:46 +08:00
+								            prompt_tokens=recv_obj.prompt_tokens,
 								            completion_tokens=recv_obj.completion_tokens,
 								            cached_tokens=recv_obj.cached_tokens,
 								            spec_verify_ct=recv_obj.spec_verify_ct,
 								            input_token_logprobs_val=recv_obj.input_token_logprobs_val,
 								            input_token_logprobs_idx=recv_obj.input_token_logprobs_idx,
 								            output_token_logprobs_val=recv_obj.output_token_logprobs_val,
 								            output_token_logprobs_idx=recv_obj.output_token_logprobs_idx,
 								            input_top_logprobs_val=recv_obj.input_top_logprobs_val,
 								            input_top_logprobs_idx=recv_obj.input_top_logprobs_idx,
 								            output_top_logprobs_val=recv_obj.output_top_logprobs_val,
 								            output_top_logprobs_idx=recv_obj.output_top_logprobs_idx,
-												Support penalty in overlap mode; return logprob with chunked prefill; improve benchmark scripts (#3988)

Co-authored-by: SangBin Cho <rkooo567@gmail.com>
Co-authored-by: dhou-xai <dhou@x.ai>
Co-authored-by: Hanming Lu <hanming_lu@berkeley.edu>

											
										
										
											2025-03-03 00:12:04 -08:00
+								            input_token_ids_logprobs_val=recv_obj.input_token_ids_logprobs_val,
 								            input_token_ids_logprobs_idx=recv_obj.input_token_ids_logprobs_idx,
 								            output_token_ids_logprobs_val=recv_obj.output_token_ids_logprobs_val,
 								            output_token_ids_logprobs_idx=recv_obj.output_token_ids_logprobs_idx,
-												Improve: Use TypeBasedDispatcher in DetokenizerManager (#3117)


											
										
										
											2025-02-22 11:50:46 +08:00
+								            output_hidden_states=recv_obj.output_hidden_states,
-												[Auto Sync] Update io_struct.py (20250909) (#10236)

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Co-authored-by: cctry <shiyang@x.ai>
											
										
										
											2025-09-09 14:15:21 -07:00
+								            placeholder_tokens_idx=None,
 								            placeholder_tokens_val=None,
-												Improve: Use TypeBasedDispatcher in DetokenizerManager (#3117)


											
										
										
											2025-02-22 11:50:46 +08:00
+								        )
-												Support penalty in overlap mode; return logprob with chunked prefill; improve benchmark scripts (#3988)

Co-authored-by: SangBin Cho <rkooo567@gmail.com>
Co-authored-by: dhou-xai <dhou@x.ai>
Co-authored-by: Hanming Lu <hanming_lu@berkeley.edu>

											
										
										
											2025-03-03 00:12:04 -08:00
+								    def handle_multimodal_decode_req(self, recv_obj: BatchMultimodalDecodeReq):
-												Support incremental streaming of logprob/token_ids between scheduler and detokenizer (#6225)

Co-authored-by: SangBin Cho <rkooo567@gmail.com>
											
										
										
											2025-05-12 14:33:38 -07:00
+								        outputs = self.tokenizer.detokenize(recv_obj)
 								        return BatchMultimodalOut(
 								            rids=recv_obj.rids,
 								            finished_reasons=recv_obj.finished_reasons,
 								            outputs=outputs,
 								            prompt_tokens=recv_obj.prompt_tokens,
 								            completion_tokens=recv_obj.completion_tokens,
 								            cached_tokens=recv_obj.cached_tokens,
-												[Auto Sync] Update io_struct.py (20250909) (#10236)

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Co-authored-by: cctry <shiyang@x.ai>
											
										
										
											2025-09-09 14:15:21 -07:00
+								            placeholder_tokens_idx=None,
 								            placeholder_tokens_val=None,
-												Support incremental streaming of logprob/token_ids between scheduler and detokenizer (#6225)

Co-authored-by: SangBin Cho <rkooo567@gmail.com>
											
										
										
											2025-05-12 14:33:38 -07:00
+								        )
-												[bug] fixed batch api for DeepSeek V3/R1 (#3754)


											
										
										
											2025-02-22 02:28:16 +08:00
-												Support GC Freezing to improve latency & throughput (#9241)

Co-authored-by: Chanh Nguyen <cnguyen@linkedin.com>
Co-authored-by: Liangsheng Yin <hnyls2002@gmail.com>
											
										
										
											2025-08-22 22:43:09 -07:00
+								    def handle_freeze_gc_req(self, recv_req: FreezeGCReq):
 								        freeze_gc("Detokenizer Manager")
 								        return None
-												release initial code

Co-authored-by: Ying Sheng <sqy1415@gmail.com>
Co-authored-by: Liangsheng Yin <hnyls2002@gmail.com>
Co-authored-by: Zhiqiang Xie <xiezhq@stanford.edu>
Co-authored-by: parasol-aser <3848358+parasol-aser@users.noreply.github.com>
Co-authored-by: LiviaSun <33578456+ChuyueSun@users.noreply.github.com>
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>

											
										
										
											2024-01-08 04:37:50 +00:00
-												Improve process creation (#1534)


											
										
										
											2024-09-29 02:36:12 -07:00
class LimitedCapacityDict(OrderedDict):
    """An ``OrderedDict`` that drops its oldest entry when a new key would overflow it.

    Args:
        capacity: Maximum number of entries kept. Inserting a new key while the
            dict already holds ``capacity`` entries first removes the entry at
            the front of the ordering.
        *args, **kwargs: Initial contents, forwarded to ``OrderedDict``.
    """

    def __init__(self, capacity: int, *args, **kwargs):
        # Assign the capacity before delegating: OrderedDict.__init__ inserts
        # initial items through __setitem__, which reads self.capacity.
        self.capacity = capacity
        super().__init__(*args, **kwargs)

    def __setitem__(self, key, value):
        # Overwriting an existing key does not grow the dict, so only a
        # brand-new key may trigger an eviction.
        if key not in self and len(self) >= self.capacity:
            # Remove the oldest element (first item in the dict)
            self.popitem(last=False)
        # Set the new item
        super().__setitem__(key, value)
 								def run_detokenizer_process(
-												release initial code

Co-authored-by: Ying Sheng <sqy1415@gmail.com>
Co-authored-by: Liangsheng Yin <hnyls2002@gmail.com>
Co-authored-by: Zhiqiang Xie <xiezhq@stanford.edu>
Co-authored-by: parasol-aser <3848358+parasol-aser@users.noreply.github.com>
Co-authored-by: LiviaSun <33578456+ChuyueSun@users.noreply.github.com>
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>

											
										
										
											2024-01-08 04:37:50 +00:00
+								    server_args: ServerArgs,
 								    port_args: PortArgs,
 								):
-												Support penalty in overlap mode; return logprob with chunked prefill; improve benchmark scripts (#3988)

Co-authored-by: SangBin Cho <rkooo567@gmail.com>
Co-authored-by: dhou-xai <dhou@x.ai>
Co-authored-by: Hanming Lu <hanming_lu@berkeley.edu>

											
										
										
											2025-03-03 00:12:04 -08:00
+								    kill_itself_when_parent_died()
-												nit: Remove busy waiting on scheduler (#2382)


											
										
										
											2024-12-08 01:06:15 -08:00
+								    setproctitle.setproctitle("sglang::detokenizer")
-												Improve process creation (#1534)


											
										
										
											2024-09-29 02:36:12 -07:00
+								    configure_logger(server_args)
-												Crash the server correctly during error (#2231)


											
										
										
											2024-11-28 00:22:39 -08:00
+								    parent_process = psutil.Process().parent()
-												Improve process creation (#1534)


											
										
										
											2024-09-29 02:36:12 -07:00
-												release initial code

Co-authored-by: Ying Sheng <sqy1415@gmail.com>
Co-authored-by: Liangsheng Yin <hnyls2002@gmail.com>
Co-authored-by: Zhiqiang Xie <xiezhq@stanford.edu>
Co-authored-by: parasol-aser <3848358+parasol-aser@users.noreply.github.com>
Co-authored-by: LiviaSun <33578456+ChuyueSun@users.noreply.github.com>
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>

											
										
										
											2024-01-08 04:37:50 +00:00
+								    try:
 								        manager = DetokenizerManager(server_args, port_args)
-												Support Multi Process Tokenizer Manager(#6555) (#8964)

Signed-off-by: ybyang <ybyang7@iflytek.com>
Signed-off-by: huanglong <huanglong@linux.alibaba.com>
Co-authored-by: Huang Long <121648372+LLLL114@users.noreply.github.com>
Co-authored-by: huanglong <huanglong@linux.alibaba.com>
Co-authored-by: Shangming Cai <csmthu@gmail.com>
											
										
										
											2025-09-01 16:00:13 +08:00
+								        if server_args.tokenizer_worker_num > 1:
-												[1/2] Refactor multi-tokenizer manager (#10074)


											
										
										
											2025-09-07 19:13:34 +08:00
+								            manager.multi_http_worker_event_loop()
-												Support Multi Process Tokenizer Manager(#6555) (#8964)

Signed-off-by: ybyang <ybyang7@iflytek.com>
Signed-off-by: huanglong <huanglong@linux.alibaba.com>
Co-authored-by: Huang Long <121648372+LLLL114@users.noreply.github.com>
Co-authored-by: huanglong <huanglong@linux.alibaba.com>
Co-authored-by: Shangming Cai <csmthu@gmail.com>
											
										
										
											2025-09-01 16:00:13 +08:00
+								        else:
 								            manager.event_loop()
-												Crash the server when error or OOM happens (#514)


											
										
										
											2024-06-07 19:22:34 -07:00
+								    except Exception:
-												[1/2] Refactor multi-tokenizer manager (#10074)


											
										
										
											2025-09-07 19:13:34 +08:00
+								        manager.socket_mapping.clear_all_sockets()
-												Crash the server correctly during error (#2231)


											
										
										
											2024-11-28 00:22:39 -08:00
+								        traceback = get_exception_traceback()
 								        logger.error(f"DetokenizerManager hit an exception: {traceback}")
 								        parent_process.send_signal(signal.SIGQUIT)