[feat] Refactor session control interface and add CI (#2173)

2024-11-25 12:32:51 -08:00
parent 5ada33ffa0
commit e1e595d702
8 changed files with 180 additions and 154 deletions
--- a/python/sglang/srt/managers/detokenizer_manager.py
+++ b/python/sglang/srt/managers/detokenizer_manager.py
@@ -173,7 +173,6 @@ class DetokenizerManager:
                    output_strs=output_strs,
                    meta_info=recv_obj.meta_info,
                    finished_reason=recv_obj.finished_reason,
                    session_ids=recv_obj.session_ids,
                )
            )
--- a/python/sglang/srt/managers/io_struct.py
+++ b/python/sglang/srt/managers/io_struct.py
@@ -19,7 +19,7 @@ processes (TokenizerManager, DetokenizerManager, Controller).
 import uuid
 from dataclasses import dataclass
 from enum import Enum
-from typing import Dict, List, Optional, Union
+from typing import Dict, List, Optional, Tuple, Union
 from sglang.srt.managers.schedule_batch import BaseFinishReason
 from sglang.srt.sampling.sampling_params import SamplingParams
@@ -55,8 +55,9 @@ class GenerateReqInput:
    lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
    # Session id info for continual prompting
-    session_id: Optional[Union[List[str], str]] = None
+    session: Optional[
-    session_rid: Optional[Union[List[str], str]] = None
+        Union[List[Tuple[str, Optional[str]]], Tuple[str, Optional[str]]]
    ] = None
    def normalize_batch_and_arguments(self):
        if (self.text is None and self.input_ids is None) or (
@@ -203,7 +204,7 @@ class TokenizedGenerateReqInput:
    lora_path: Optional[str] = None  # None means just use the base model
    # Session id info for continual prompting
-    session_id: Optional[int] = None
+    session_id: Optional[str] = None
    session_rid: Optional[str] = None
@@ -299,8 +300,6 @@ class BatchTokenIDOut:
    meta_info: List[Dict]
    finished_reason: List[BaseFinishReason]
    no_stop_trim: List[bool]
    # The updated session unique id
    session_ids: List[str]
@dataclass
@@ -313,8 +312,6 @@ class BatchStrOut:
    meta_info: List[Dict]
    # The finish reason
    finished_reason: List[BaseFinishReason]
    # The update session unique id
    session_ids: List[str]
@dataclass
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -542,9 +542,7 @@ class Scheduler:
        else:
            # Handle sessions
            session = self.sessions[recv_req.session_id]
-            req, new_session_id = session.create_req(recv_req, self.tokenizer)
+            req = session.create_req(recv_req, self.tokenizer)
            del self.sessions[recv_req.session_id]
            self.sessions[new_session_id] = session
            if isinstance(req.finished_reason, FINISH_ABORT):
                self.waiting_queue.append(req)
                return
@@ -1188,7 +1186,6 @@ class Scheduler:
            output_skip_special_tokens = []
            output_spaces_between_special_tokens = []
            output_no_stop_trim = []
            output_session_ids = []
        else:  # embedding or reward model
            output_embeddings = []
@@ -1216,7 +1213,6 @@ class Scheduler:
                        req.sampling_params.spaces_between_special_tokens
                    )
                    output_no_stop_trim.append(req.sampling_params.no_stop_trim)
                    output_session_ids.append(req.session_id)
                    meta_info = {
                        "prompt_tokens": len(req.origin_input_ids),
@@ -1267,7 +1263,6 @@ class Scheduler:
                        output_meta_info,
                        output_finished_reason,
                        output_no_stop_trim,
                        output_session_ids,
                    )
                )
            else:  # embedding or reward model
--- a/python/sglang/srt/managers/session_controller.py
+++ b/python/sglang/srt/managers/session_controller.py
@@ -26,13 +26,13 @@ class Session:
        self.reqs: List[Req] = []
    def create_req(self, req: TokenizedGenerateReqInput, tokenizer):
        # renew session id
        self.session_id = uuid.uuid4().hex
        if req.session_rid is not None:
            while len(self.reqs) > 0:
                if self.reqs[-1].rid == req.session_rid:
                    break
                self.reqs = self.reqs[:-1]
        else:
            self.reqs = []
        if len(self.reqs) > 0:
            input_ids = (
                self.reqs[-1].origin_input_ids
@@ -58,4 +58,4 @@ class Session:
            )
        else:
            self.reqs.append(new_req)
-        return new_req, self.session_id
+        return new_req
--- a/python/sglang/srt/managers/tokenizer_manager.py
+++ b/python/sglang/srt/managers/tokenizer_manager.py
@@ -216,8 +216,8 @@ class TokenizerManager:
            return_logprob = obj.return_logprob
            logprob_start_len = obj.logprob_start_len
            top_logprobs_num = obj.top_logprobs_num
-            session_id = obj.session_id
+            session_id = obj.session[0] if obj.session else None
-            session_rid = obj.session_rid
+            session_rid = obj.session[1] if obj.session else None
        if len(input_ids) >= self.context_len:
            raise ValueError(
@@ -570,13 +570,11 @@ class TokenizerManager:
                    out_dict = {
                        "text": recv_obj.output_strs[i],
                        "meta_info": recv_obj.meta_info[i],
                        "session_id": recv_obj.session_ids[i],
                    }
                elif isinstance(recv_obj, BatchTokenIDOut):
                    out_dict = {
                        "token_ids": recv_obj.output_ids[i],
                        "meta_info": recv_obj.meta_info[i],
                        "session_id": recv_obj.session_ids[i],
                    }
                else:
                    assert isinstance(recv_obj, BatchEmbeddingOut)
--- a/scripts/playground/test_session_id.py
+++ b/scripts/playground/test_session_id.py
@@ -1,132 +0,0 @@
 # Copyright 2023-2024 SGLang Team
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #     http://www.apache.org/licenses/LICENSE-2.0
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
 # FIXME: Make it a CI test
 import requests
 from sglang.srt.hf_transformers_utils import get_tokenizer
 # Manual walkthrough of the session endpoints. It expects a server to be
 # reachable at the URL below and only prints each response; nothing is asserted.
 url = "http://localhost:30000"
 # Open a session; the JSON body of the response is used as the session id.
 response = requests.post(
    url + "/open_session",
    json={"capacity_of_str_len": 1000},
 )
 session_id = response.json()
 print("session_id", session_id, "\n")
 # Prefill only: the request asks for zero new tokens.
 prompt = "chunk 1"
 response = requests.post(
    url + "/generate",
    json={
        "text": prompt,
        "session_id": session_id,
        "sampling_params": {
            "temperature": 0,
            "max_new_tokens": 0,
        },
    },
 )
 print(response.json(), "\n")
 # Every /generate response carries a "session_id" field; it is read back here
 # and sent with the next request.
 session_id = response.json()["session_id"]
 # Generate on top of the first chunk.
 prompt = "Chunk 2"
 response = requests.post(
    url + "/generate",
    json={
        "text": prompt,
        "session_id": session_id,
        "sampling_params": {
            "temperature": 0,
            "max_new_tokens": 16,
        },
    },
 )
 print(response.json(), "\n")
 session_id = response.json()["session_id"]
 # Remember this request id: it is passed as "session_rid" further down to
 # interrupt the session and re-generate.
 rid = response.json()["meta_info"]["id"]
 # Generate again; this request's id is kept because it is reused below as the
 # "deleted request" case.
 prompt = "Chunk 3"
 response = requests.post(
    url + "/generate",
    json={
        "text": prompt,
        "session_id": session_id,
        "sampling_params": {
            "temperature": 0,
            "max_new_tokens": 2,
        },
    },
 )
 print(response.json(), "\n")
 session_id = response.json()["session_id"]
 rid_to_del = response.json()["meta_info"]["id"]
 # Interrupt and re-generate: "session_rid" points at the "Chunk 2" request.
 prompt = "Chunk 4"
 response = requests.post(
    url + "/generate",
    json={
        "text": prompt,
        "session_id": session_id,
        "session_rid": rid,
        "sampling_params": {
            "temperature": 0,
            "max_new_tokens": 16,
        },
    },
 )
 print(response.json(), "\n")
 session_id = response.json()["session_id"]
 # Query a session based on a deleted request (the "Chunk 3" request id);
 # should see finish reason abort.
 prompt = "Chunk 4"
 response = requests.post(
    url + "/generate",
    json={
        "text": prompt,
        "session_id": session_id,
        "session_rid": rid_to_del,
        "sampling_params": {
            "temperature": 0,
            "max_new_tokens": 16,
        },
    },
 )
 print(response.json(), "\n")
 # Close the session. Note that the Response object itself is printed here,
 # not its JSON body.
 ret = requests.post(
    url + "/close_session",
    json={"session_id": session_id},
 )
 print(ret, "\n")
 # Query a deleted session, should see finish reason abort
 prompt = "chunk 1"
 response = requests.post(
    url + "/generate",
    json={
        "text": prompt,
        "session_id": session_id,
        "sampling_params": {
            "temperature": 0,
            "max_new_tokens": 0,
        },
    },
 )
 print(response.json(), "\n")
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -34,6 +34,7 @@ suites = {
        "test_triton_attention_backend.py",
        "test_update_weights.py",
        "test_vision_openai_server.py",
        "test_session_control.py",
    ],
    "sampling/penaltylib": glob.glob(
        "sampling/penaltylib/**/test_*.py", recursive=True
--- a/test/srt/test_session_control.py
+++ b/test/srt/test_session_control.py
@@ -0,0 +1,168 @@
 """
 Usage:
 python3 -m unittest test_session_control.TestSessionControl.test_session_control
 python3 -m unittest test_session_control.TestSessionControl.test_session_control_vlm
 """
 import unittest
 import requests
 from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.utils import kill_child_process
 from sglang.test.test_utils import (
    DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    popen_launch_server,
 )
 class TestSessionControl(unittest.TestCase):
    """End-to-end check of the ``session`` field of the ``/generate`` endpoint.
    A server is launched once for the whole class. The test sends a sequence of
    chunks through one session and compares the generated texts with the ones
    obtained by sending the accumulated token ids without any session.
    """
    @classmethod
    def setUpClass(cls):
        """Launch the server process shared by every test in this class."""
        cls.model = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.process = popen_launch_server(
            cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
        )
    @classmethod
    def tearDownClass(cls):
        """Kill the server process started in ``setUpClass``."""
        kill_child_process(cls.process.pid, include_self=True)
    def test_session_control(self):
        """Session-based generation must match plain concatenated generation.
        Also checks that a request naming a request id that is no longer part
        of the session, or a session that has been closed, finishes with
        reason ``abort``.
        """
        chunks = [
            "Let me tell you something about France.",
            "The capital of France is",
            "A brief history about that city is",
            "To plan a travel, the budget is",
        ]
        tokenizer = get_tokenizer(self.model)
        chunks_ids = [tokenizer.encode(x) for x in chunks]
        # 1. using session control
        session_id = requests.post(
            self.base_url + "/open_session",
            json={"capacity_of_str_len": 1000},
        ).json()
        # "session" is sent as [session id, id of the request to continue from];
        # the request id is None for the very first chunk.
        rid = None
        first_rid = None
        outputs_from_session = []
        for i, chunk_ids in enumerate(chunks_ids):
            response = requests.post(
                self.base_url + "/generate",
                json={
                    "input_ids": chunk_ids,
                    "session": [session_id, rid],
                    "sampling_params": {
                        "temperature": 0,
                        "max_new_tokens": (
                            16 if i > 0 else 0
                        ),  # prefill only for the first chunk
                    },
                },
            ).json()
            rid = response["meta_info"]["id"]
            if i == 0:
                first_rid = rid
            if i > 0:
                outputs_from_session.append(response["text"])
        # backtrack to the first request and regenerate
        response = requests.post(
            self.base_url + "/generate",
            json={
                "input_ids": chunks_ids[-1],
                "session": [session_id, first_rid],
                "sampling_params": {
                    "temperature": 0,
                    "max_new_tokens": 16,
                },
            },
        ).json()
        outputs_from_session.append(response["text"])
        # query with a non-existing rid (the last one should have disappeared
        # because of the backtrack), should see abort
        response = requests.post(
            self.base_url + "/generate",
            json={
                "input_ids": chunks_ids[-1],
                "session": [session_id, rid],
                "sampling_params": {
                    "temperature": 0,
                    "max_new_tokens": 16,
                },
            },
        ).json()
        # unittest assertion methods are used instead of bare ``assert`` so the
        # checks survive ``python -O`` and report both values on failure.
        self.assertEqual(response["meta_info"]["finish_reason"]["type"], "abort")
        ret = requests.post(
            self.base_url + "/close_session",
            json={"session_id": session_id},
        )
        self.assertEqual(ret.status_code, 200)
        # send a request to a closed session, should see abort
        response = requests.post(
            self.base_url + "/generate",
            json={
                "input_ids": chunks_ids[-1],
                "session": [session_id, first_rid],
                "sampling_params": {
                    "temperature": 0,
                    "max_new_tokens": 16,
                },
            },
        ).json()
        self.assertEqual(response["meta_info"]["finish_reason"]["type"], "abort")
        # 2. not use session control
        input_ids_first_req = None
        input_ids = []
        outputs_normal = []
        for i, chunk_ids in enumerate(chunks_ids):
            input_ids += chunk_ids
            response = requests.post(
                self.base_url + "/generate",
                json={
                    "input_ids": input_ids,
                    "sampling_params": {
                        "temperature": 0,
                        "max_new_tokens": (
                            16 if i > 0 else 0
                        ),  # prefill only for the first chunk
                    },
                },
            ).json()
            if i > 0:
                input_ids += tokenizer.encode(response["text"])[
                    1:
                ]  # drop the bos token
                outputs_normal.append(response["text"])
            if i == 0:
                # snapshot of the first request's ids, used to mirror the backtrack
                input_ids_first_req = input_ids.copy()
        # mirror of the backtracked request: first chunk followed by the last chunk
        input_ids_first_req += chunks_ids[-1]
        response = requests.post(
            self.base_url + "/generate",
            json={
                "input_ids": input_ids_first_req,
                "sampling_params": {
                    "temperature": 0,
                    "max_new_tokens": 16,
                },
            },
        ).json()
        outputs_normal.append(response["text"])
        print("outputs from chunked queries with session control:")
        print(outputs_from_session)
        print("outputs from normal queries:")
        print(outputs_normal)
        self.assertEqual(outputs_from_session, outputs_normal)
 if __name__ == "__main__":
    unittest.main()