sglang/python/sglang/srt/managers/detokenizer_manager.py

"""
Copyright 2023-2024 SGLang Team
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

"""DetokenizerManager is a process that detokenizes the token ids."""

import asyncio
import dataclasses
import inspect
from typing import List

import uvloop
import zmq
import zmq.asyncio

from sglang.srt.hf_transformers_utils import get_tokenizer
from sglang.srt.managers.controller.infer_batch import FINISH_MATCHED_STR
from sglang.srt.managers.io_struct import BatchStrOut, BatchTokenIDOut
from sglang.srt.server_args import PortArgs, ServerArgs
from sglang.utils import find_printable_text, get_exception_traceback, graceful_registry

# Install uvloop's event loop policy at import time, so that event loops
# obtained through asyncio afterwards in this process are created by uvloop.
asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())


@dataclasses.dataclass
class DecodeStatus:
    """Incremental detokenization state kept for one request id.

    DetokenizerManager.handle_loop stores one instance per request id and
    updates it every time a new batch for that request arrives.
    """

    # Value taken from BatchTokenIDOut.vids; when an incoming value differs
    # from the stored one, the whole status is rebuilt from the message.
    vid: int
    # Text that has already been committed for this request.
    decoded_text: str
    # Most recent list of token ids received for this request.
    decode_ids: List[int]
    # Start index in decode_ids of the slice that is re-decoded on every step.
    surr_offset: int
    # Index in decode_ids separating the surrounding context
    # (decode_ids[surr_offset:read_offset]) from the ids whose text has not
    # been committed yet.
    read_offset: int


class DetokenizerManager:
    """Convert batches of token ids into text and forward them.

    The manager pulls ``BatchTokenIDOut`` objects from a ZeroMQ PULL socket,
    detokenizes them incrementally (keeping a ``DecodeStatus`` per request
    id), and pushes the resulting ``BatchStrOut`` objects to a ZeroMQ PUSH
    socket.
    """

    # U+FFFD REPLACEMENT CHARACTER, written as an escape so the check does not
    # depend on how this source file is stored or re-encoded. Decoded text
    # that ends with it is treated as an unfinished character and is not
    # committed yet.
    _REPLACEMENT_CHAR = "\ufffd"

    def __init__(
        self,
        server_args: ServerArgs,
        port_args: PortArgs,
    ):
        """Create the sockets and load the tokenizer.

        Args:
            server_args: Provides ``tokenizer_path``, ``tokenizer_mode`` and
                ``trust_remote_code`` for loading the tokenizer.
            port_args: Provides ``detokenizer_port`` (bound locally for
                incoming batches) and ``tokenizer_port`` (connected to for
                outgoing results).
        """
        context = zmq.asyncio.Context(2)
        self.recv_from_router = context.socket(zmq.PULL)
        self.recv_from_router.bind(f"tcp://127.0.0.1:{port_args.detokenizer_port}")

        self.send_to_tokenizer = context.socket(zmq.PUSH)
        self.send_to_tokenizer.connect(f"tcp://127.0.0.1:{port_args.tokenizer_port}")

        self.tokenizer = get_tokenizer(
            server_args.tokenizer_path,
            tokenizer_mode=server_args.tokenizer_mode,
            trust_remote_code=server_args.trust_remote_code,
        )

        # Maps request id -> DecodeStatus for requests that are still running.
        self.decode_status = {}

    async def handle_loop(self):
        """Receive token-id batches forever and send back the decoded strings.

        For every request in a batch the ids ``decode_ids[surr_offset:]`` and
        ``decode_ids[surr_offset:read_offset]`` are both decoded; the text the
        longer decode has beyond the shorter one is the newly produced text.
        While a request is unfinished, new text is committed to its status
        only if it is non-empty and does not end with U+FFFD; otherwise the
        status is left unchanged and only the result of
        ``find_printable_text`` is appended to the output for this step.

        When a request reports a finish reason, its status entry is removed
        so that ``self.decode_status`` does not grow without bound.
        """
        while True:
            recv_obj: BatchTokenIDOut = await self.recv_from_router.recv_pyobj()
            assert isinstance(recv_obj, BatchTokenIDOut)

            # Initialize or refresh the decode status of every request and
            # collect the id slices that have to be decoded.
            statuses, read_ids, surr_ids = [], [], []
            for i, rid in enumerate(recv_obj.rids):
                vid = recv_obj.vids[i]
                s = self.decode_status.get(rid)
                if s is None or s.vid != vid:
                    # Unknown request, or its vid changed: start over from
                    # the state carried by the message.
                    s = DecodeStatus(
                        vid=vid,
                        decoded_text=recv_obj.decoded_texts[i],
                        decode_ids=recv_obj.decode_ids[i],
                        surr_offset=0,
                        read_offset=recv_obj.read_offsets[i],
                    )
                    self.decode_status[rid] = s
                else:
                    s.decode_ids = recv_obj.decode_ids[i]

                statuses.append(s)
                read_ids.append(s.decode_ids[s.surr_offset :])
                surr_ids.append(s.decode_ids[s.surr_offset : s.read_offset])

            # TODO(lmzheng): handle skip_special_tokens/spaces_between_special_tokens per request
            surr_texts = self.tokenizer.batch_decode(
                surr_ids,
                skip_special_tokens=recv_obj.skip_special_tokens[0],
                spaces_between_special_tokens=recv_obj.spaces_between_special_tokens[0],
            )
            read_texts = self.tokenizer.batch_decode(
                read_ids,
                skip_special_tokens=recv_obj.skip_special_tokens[0],
                spaces_between_special_tokens=recv_obj.spaces_between_special_tokens[0],
            )

            # Trim stop str
            # TODO(lmzheng): handle the case where multiple stop strs are hit
            output_strs = []
            for i, rid in enumerate(recv_obj.rids):
                s = statuses[i]
                finished_reason = recv_obj.finished_reason[i]
                new_text = read_texts[i][len(surr_texts[i]) :]
                if finished_reason is None:
                    # Streaming chunk: update the decode status
                    if new_text and not new_text.endswith(self._REPLACEMENT_CHAR):
                        s.decoded_text = s.decoded_text + new_text
                        s.surr_offset = s.read_offset
                        s.read_offset = len(s.decode_ids)
                        new_text = ""
                    else:
                        new_text = find_printable_text(new_text)
                else:
                    # The request is done; drop its state. If the same rid
                    # ever shows up again it is rebuilt from the message.
                    self.decode_status.pop(rid, None)

                output_str = s.decoded_text + new_text

                if isinstance(finished_reason, FINISH_MATCHED_STR):
                    # Cut the output at the first occurrence of the stop string.
                    pos = output_str.find(finished_reason.matched)
                    if pos != -1:
                        output_str = output_str[:pos]

                output_strs.append(output_str)

            self.send_to_tokenizer.send_pyobj(
                BatchStrOut(
                    rids=recv_obj.rids,
                    output_strs=output_strs,
                    meta_info=recv_obj.meta_info,
                    finished_reason=recv_obj.finished_reason,
                )
            )


def start_detokenizer_process(
    server_args: ServerArgs,
    port_args: PortArgs,
    pipe_writer,
):
    """Build a DetokenizerManager and run its handle loop in this process.

    Args:
        server_args: Forwarded to ``DetokenizerManager``.
        port_args: Forwarded to ``DetokenizerManager``.
        pipe_writer: Object with a ``send`` method. It receives the string
            ``"init ok"`` once the manager has been constructed, or the
            exception traceback if construction fails.

    Raises:
        Exception: Whatever ``DetokenizerManager`` raised during construction,
            re-raised after the traceback has been sent through the pipe.
    """
    # Register this function's own name with graceful_registry
    # (NOTE(review): presumably sets up shutdown handling; see sglang.utils).
    graceful_registry(inspect.currentframe().f_code.co_name)

    try:
        detokenizer = DetokenizerManager(server_args, port_args)
    except Exception:
        # Report the failure through the pipe before propagating it.
        pipe_writer.send(get_exception_traceback())
        raise
    else:
        pipe_writer.send("init ok")

    # Blocks for as long as handle_loop runs.
    asyncio.get_event_loop().run_until_complete(detokenizer.handle_loop())
-												chore: add copyright for srt (#790)


											
										
										
											2024-07-28 23:07:12 +10:00
+								"""
 								Copyright 2023-2024 SGLang Team
 								Licensed under the Apache License, Version 2.0 (the "License");
 								you may not use this file except in compliance with the License.
 								You may obtain a copy of the License at
 								    http://www.apache.org/licenses/LICENSE-2.0
 								Unless required by applicable law or agreed to in writing, software
 								distributed under the License is distributed on an "AS IS" BASIS,
 								WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 								See the License for the specific language governing permissions and
 								limitations under the License.
 								"""
-												Improve doc strings (#518)

											
										
										
											2024-06-08 02:06:52 -07:00
+								"""DetokenizerManager is a process that detokenizes the token ids."""
-												Higher priority for user input of max_prefill_tokens & format (#540)


											
										
										
											2024-06-12 21:48:40 -07:00
-												release initial code

Co-authored-by: Ying Sheng <sqy1415@gmail.com>
Co-authored-by: Liangsheng Yin <hnyls2002@gmail.com>
Co-authored-by: Zhiqiang Xie <xiezhq@stanford.edu>
Co-authored-by: parasol-aser <3848358+parasol-aser@users.noreply.github.com>
Co-authored-by: LiviaSun <33578456+ChuyueSun@users.noreply.github.com>
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>

											
										
										
											2024-01-08 04:37:50 +00:00
+								import asyncio
-												Detokenize incrementally when streaming (#653)


											
										
										
											2024-07-18 17:57:40 -07:00
+								import dataclasses
-												Improve logging & add logit cap (#471)


											
										
										
											2024-05-24 03:48:53 -07:00
+								import inspect
-												Detokenize incrementally when streaming (#653)


											
										
										
											2024-07-18 17:57:40 -07:00
+								from typing import List
-												release initial code

Co-authored-by: Ying Sheng <sqy1415@gmail.com>
Co-authored-by: Liangsheng Yin <hnyls2002@gmail.com>
Co-authored-by: Zhiqiang Xie <xiezhq@stanford.edu>
Co-authored-by: parasol-aser <3848358+parasol-aser@users.noreply.github.com>
Co-authored-by: LiviaSun <33578456+ChuyueSun@users.noreply.github.com>
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>

											
										
										
											2024-01-08 04:37:50 +00:00
 								import uvloop
 								import zmq
 								import zmq.asyncio
-												add `.isort.cfg` (#378)


											
										
										
											2024-04-22 22:38:09 +08:00
-												release initial code

Co-authored-by: Ying Sheng <sqy1415@gmail.com>
Co-authored-by: Liangsheng Yin <hnyls2002@gmail.com>
Co-authored-by: Zhiqiang Xie <xiezhq@stanford.edu>
Co-authored-by: parasol-aser <3848358+parasol-aser@users.noreply.github.com>
Co-authored-by: LiviaSun <33578456+ChuyueSun@users.noreply.github.com>
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>

											
										
										
											2024-01-08 04:37:50 +00:00
+								from sglang.srt.hf_transformers_utils import get_tokenizer
-												Higher priority for user input of max_prefill_tokens & format (#540)


											
										
										
											2024-06-12 21:48:40 -07:00
+								from sglang.srt.managers.controller.infer_batch import FINISH_MATCHED_STR
-												release initial code

Co-authored-by: Ying Sheng <sqy1415@gmail.com>
Co-authored-by: Liangsheng Yin <hnyls2002@gmail.com>
Co-authored-by: Zhiqiang Xie <xiezhq@stanford.edu>
Co-authored-by: parasol-aser <3848358+parasol-aser@users.noreply.github.com>
Co-authored-by: LiviaSun <33578456+ChuyueSun@users.noreply.github.com>
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>

											
										
										
											2024-01-08 04:37:50 +00:00
+								from sglang.srt.managers.io_struct import BatchStrOut, BatchTokenIDOut
 								from sglang.srt.server_args import PortArgs, ServerArgs
-												* fix(detokenizer_manager.py): fix truncated decoded output (#586)

Co-authored-by: hnyls2002 <hnyls2002@gmail.com>
											
										
										
											2024-07-07 05:53:22 +08:00
+								from sglang.utils import find_printable_text, get_exception_traceback, graceful_registry
-												release initial code

Co-authored-by: Ying Sheng <sqy1415@gmail.com>
Co-authored-by: Liangsheng Yin <hnyls2002@gmail.com>
Co-authored-by: Zhiqiang Xie <xiezhq@stanford.edu>
Co-authored-by: parasol-aser <3848358+parasol-aser@users.noreply.github.com>
Co-authored-by: LiviaSun <33578456+ChuyueSun@users.noreply.github.com>
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>

											
										
										
											2024-01-08 04:37:50 +00:00
 								asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
-												Detokenize incrementally when streaming (#653)


											
										
										
											2024-07-18 17:57:40 -07:00
+								@dataclasses.dataclass
 								class DecodeStatus:
-												Fix jump forward when streaming (#665)


											
										
										
											2024-07-19 16:42:06 -07:00
+								    vid: int
-												Detokenize incrementally when streaming (#653)


											
										
										
											2024-07-18 17:57:40 -07:00
+								    decoded_text: str
 								    decode_ids: List[int]
 								    surr_offset: int
 								    read_offset: int
-												release initial code

Co-authored-by: Ying Sheng <sqy1415@gmail.com>
Co-authored-by: Liangsheng Yin <hnyls2002@gmail.com>
Co-authored-by: Zhiqiang Xie <xiezhq@stanford.edu>
Co-authored-by: parasol-aser <3848358+parasol-aser@users.noreply.github.com>
Co-authored-by: LiviaSun <33578456+ChuyueSun@users.noreply.github.com>
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>

											
										
										
											2024-01-08 04:37:50 +00:00
+								class DetokenizerManager:
 								    def __init__(
 								        self,
 								        server_args: ServerArgs,
 								        port_args: PortArgs,
 								    ):
 								        context = zmq.asyncio.Context(2)
 								        self.recv_from_router = context.socket(zmq.PULL)
 								        self.recv_from_router.bind(f"tcp://127.0.0.1:{port_args.detokenizer_port}")
 								        self.send_to_tokenizer = context.socket(zmq.PUSH)
 								        self.send_to_tokenizer.connect(f"tcp://127.0.0.1:{port_args.tokenizer_port}")
 								        self.tokenizer = get_tokenizer(
 								            server_args.tokenizer_path,
 								            tokenizer_mode=server_args.tokenizer_mode,
 								            trust_remote_code=server_args.trust_remote_code,
 								        )
-												Detokenize incrementally when streaming (#653)


											
										
										
											2024-07-18 17:57:40 -07:00
+								        self.decode_status = {}
-												release initial code

Co-authored-by: Ying Sheng <sqy1415@gmail.com>
Co-authored-by: Liangsheng Yin <hnyls2002@gmail.com>
Co-authored-by: Zhiqiang Xie <xiezhq@stanford.edu>
Co-authored-by: parasol-aser <3848358+parasol-aser@users.noreply.github.com>
Co-authored-by: LiviaSun <33578456+ChuyueSun@users.noreply.github.com>
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>

											
										
										
											2024-01-08 04:37:50 +00:00
+								    async def handle_loop(self):
 								        while True:
-												Fix rid state map leak + Refractor .finished (#505)

Co-authored-by: ZX <zx@lbx.dev>
											
										
										
											2024-06-08 04:20:40 +08:00
+								            recv_obj: BatchTokenIDOut = await self.recv_from_router.recv_pyobj()
 								            assert isinstance(recv_obj, BatchTokenIDOut)
-												Detokenize incrementally when streaming (#653)


											
										
										
											2024-07-18 17:57:40 -07:00
+								            bs = len(recv_obj.rids)
 								            # Initialize decode status
 								            read_ids, surr_ids = [], []
 								            for i in range(bs):
 								                rid = recv_obj.rids[i]
-												Fix jump forward when streaming (#665)


											
										
										
											2024-07-19 16:42:06 -07:00
+								                vid = recv_obj.vids[i]
 								                if rid not in self.decode_status or self.decode_status[rid].vid != vid:
-												Detokenize incrementally when streaming (#653)


											
										
										
											2024-07-18 17:57:40 -07:00
+								                    s = DecodeStatus(
-												Fix jump forward when streaming (#665)


											
										
										
											2024-07-19 16:42:06 -07:00
+								                        vid=vid,
-												Detokenize incrementally when streaming (#653)


											
										
										
											2024-07-18 17:57:40 -07:00
+								                        decoded_text=recv_obj.decoded_texts[i],
 								                        decode_ids=recv_obj.decode_ids[i],
 								                        surr_offset=0,
 								                        read_offset=recv_obj.read_offsets[i],
 								                    )
 								                    self.decode_status[rid] = s
 								                else:
 								                    s = self.decode_status[rid]
 								                    s.decode_ids = recv_obj.decode_ids[i]
 								                read_ids.append(s.decode_ids[s.surr_offset :])
 								                surr_ids.append(s.decode_ids[s.surr_offset : s.read_offset])
-												Fix rid state map leak + Refractor .finished (#505)

Co-authored-by: ZX <zx@lbx.dev>
											
										
										
											2024-06-08 04:20:40 +08:00
 								            # TODO(lmzheng): handle skip_special_tokens/spaces_between_special_tokens per request
-												Decode Incrementally (#517)


											
										
										
											2024-06-12 14:39:12 +08:00
+								            surr_texts = self.tokenizer.batch_decode(
-												Detokenize incrementally when streaming (#653)


											
										
										
											2024-07-18 17:57:40 -07:00
+								                surr_ids,
-												Decode Incrementally (#517)


											
										
										
											2024-06-12 14:39:12 +08:00
+								                skip_special_tokens=recv_obj.skip_special_tokens[0],
 								                spaces_between_special_tokens=recv_obj.spaces_between_special_tokens[0],
 								            )
 								            read_texts = self.tokenizer.batch_decode(
-												Detokenize incrementally when streaming (#653)


											
										
										
											2024-07-18 17:57:40 -07:00
+								                read_ids,
-												Fix rid state map leak + Refractor .finished (#505)

Co-authored-by: ZX <zx@lbx.dev>
											
										
										
											2024-06-08 04:20:40 +08:00
+								                skip_special_tokens=recv_obj.skip_special_tokens[0],
-												Decode Incrementally (#517)


											
										
										
											2024-06-12 14:39:12 +08:00
+								                spaces_between_special_tokens=recv_obj.spaces_between_special_tokens[0],
-												Fix rid state map leak + Refractor .finished (#505)

Co-authored-by: ZX <zx@lbx.dev>
											
										
										
											2024-06-08 04:20:40 +08:00
+								            )
 								            # Trim stop str
 								            # TODO(lmzheng): handle the case where multiple stop strs are hit
-												Decode Incrementally (#517)


											
										
										
											2024-06-12 14:39:12 +08:00
+								            output_strs = []
-												Detokenize incrementally when streaming (#653)


											
										
										
											2024-07-18 17:57:40 -07:00
+								            for i in range(bs):
 								                s = self.decode_status[recv_obj.rids[i]]
-												Decode Incrementally (#517)


											
										
										
											2024-06-12 14:39:12 +08:00
+								                new_text = read_texts[i][len(surr_texts[i]) :]
-												Fix streaming (#600)


											
										
										
											2024-07-07 01:55:58 -07:00
+								                if recv_obj.finished_reason[i] is None:
-												Detokenize incrementally when streaming (#653)


											
										
										
											2024-07-18 17:57:40 -07:00
+								                    # Streaming chunk: update the decode status
 								                    if len(new_text) > 0 and not new_text.endswith("<EFBFBD>"):
 								                        s.decoded_text = s.decoded_text + new_text
 								                        s.surr_offset = s.read_offset
 								                        s.read_offset = len(s.decode_ids)
 								                        new_text = ""
 								                    else:
 								                        new_text = find_printable_text(new_text)
 								                output_strs.append(s.decoded_text + new_text)
-												Fix rid state map leak + Refractor .finished (#505)

Co-authored-by: ZX <zx@lbx.dev>
											
										
										
											2024-06-08 04:20:40 +08:00
 								                if isinstance(recv_obj.finished_reason[i], FINISH_MATCHED_STR):
 								                    pos = output_strs[i].find(recv_obj.finished_reason[i].matched)
 								                    if pos != -1:
 								                        output_strs[i] = output_strs[i][:pos]
 								            self.send_to_tokenizer.send_pyobj(
 								                BatchStrOut(
 								                    rids=recv_obj.rids,
-												* fix(detokenizer_manager.py): fix truncated decoded output (#586)

Co-authored-by: hnyls2002 <hnyls2002@gmail.com>
											
										
										
											2024-07-07 05:53:22 +08:00
+								                    output_strs=output_strs,
-												Fix rid state map leak + Refractor .finished (#505)

Co-authored-by: ZX <zx@lbx.dev>
											
										
										
											2024-06-08 04:20:40 +08:00
+								                    meta_info=recv_obj.meta_info,
 								                    finished_reason=recv_obj.finished_reason,
-												release initial code

Co-authored-by: Ying Sheng <sqy1415@gmail.com>
Co-authored-by: Liangsheng Yin <hnyls2002@gmail.com>
Co-authored-by: Zhiqiang Xie <xiezhq@stanford.edu>
Co-authored-by: parasol-aser <3848358+parasol-aser@users.noreply.github.com>
Co-authored-by: LiviaSun <33578456+ChuyueSun@users.noreply.github.com>
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>

											
										
										
											2024-01-08 04:37:50 +00:00
+								                )
-												Fix rid state map leak + Refractor .finished (#505)

Co-authored-by: ZX <zx@lbx.dev>
											
										
										
											2024-06-08 04:20:40 +08:00
+								            )
-												release initial code

Co-authored-by: Ying Sheng <sqy1415@gmail.com>
Co-authored-by: Liangsheng Yin <hnyls2002@gmail.com>
Co-authored-by: Zhiqiang Xie <xiezhq@stanford.edu>
Co-authored-by: parasol-aser <3848358+parasol-aser@users.noreply.github.com>
Co-authored-by: LiviaSun <33578456+ChuyueSun@users.noreply.github.com>
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>

											
										
										
											2024-01-08 04:37:50 +00:00
 								def start_detokenizer_process(
 								    server_args: ServerArgs,
 								    port_args: PortArgs,
 								    pipe_writer,
 								):
-												Improve logging & add logit cap (#471)


											
										
										
											2024-05-24 03:48:53 -07:00
+								    graceful_registry(inspect.currentframe().f_code.co_name)
-												release initial code

Co-authored-by: Ying Sheng <sqy1415@gmail.com>
Co-authored-by: Liangsheng Yin <hnyls2002@gmail.com>
Co-authored-by: Zhiqiang Xie <xiezhq@stanford.edu>
Co-authored-by: parasol-aser <3848358+parasol-aser@users.noreply.github.com>
Co-authored-by: LiviaSun <33578456+ChuyueSun@users.noreply.github.com>
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>

											
										
										
											2024-01-08 04:37:50 +00:00
+								    try:
 								        manager = DetokenizerManager(server_args, port_args)
-												Crash the server when error or OOM happens (#514)


											
										
										
											2024-06-07 19:22:34 -07:00
+								    except Exception:
-												release initial code

Co-authored-by: Ying Sheng <sqy1415@gmail.com>
Co-authored-by: Liangsheng Yin <hnyls2002@gmail.com>
Co-authored-by: Zhiqiang Xie <xiezhq@stanford.edu>
Co-authored-by: parasol-aser <3848358+parasol-aser@users.noreply.github.com>
Co-authored-by: LiviaSun <33578456+ChuyueSun@users.noreply.github.com>
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>

											
										
										
											2024-01-08 04:37:50 +00:00
+								        pipe_writer.send(get_exception_traceback())
 								        raise
 								    pipe_writer.send("init ok")
 								    loop = asyncio.get_event_loop()
 								    loop.run_until_complete(manager.handle_loop())