[Session] Update session control interface (#2635)

2024-12-29 02:10:27 -08:00
parent 9c05c6898e
commit e0e09fceeb
6 changed files with 531 additions and 91 deletions
--- a/test/srt/test_session_control.py
+++ b/test/srt/test_session_control.py
@@ -1,11 +1,16 @@
 """
 Usage:
 python3 -m unittest test_session_control.TestSessionControl.test_session_control
+python3 -m unittest test_session_control.TestSessionControl.test_session_control_with_branching
+python3 -m unittest test_session_control.TestSessionControl.test_session_control_backtrack_with_abort
 python3 -m unittest test_session_control.TestSessionControlVision.test_session_control
 """

+import asyncio
+import json
 import unittest

+import aiohttp
 import requests

 from sglang.srt.hf_transformers_utils import get_tokenizer
@@ -18,6 +23,10 @@ from sglang.test.test_utils import (
 )


+def remove_prefix(text: str, prefix: str) -> str:
+    return text[len(prefix) :] if text.startswith(prefix) else text
+
+
 class TestSessionControl(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
@@ -31,15 +40,18 @@ class TestSessionControl(unittest.TestCase):
    def tearDownClass(cls):
        kill_process_tree(cls.process.pid)

-    def test_session_control(self):
+    def test_session_control(self, gen_len=12):
        chunks = [
            "Let me tell you something about France.",
            "The capital of France is",
+            "The population of the city is",
            "A brief history about that city is",
-            "To plan a travel, the budget is",
        ]
        tokenizer = get_tokenizer(self.model)
        chunks_ids = [tokenizer.encode(x) for x in chunks]
+        for i in range(1, len(chunks_ids)):
+            if chunks_ids[i][0] == tokenizer.bos_token_id:
+                chunks_ids[i] = chunks_ids[i][1:]

        # 1. using session control
        session_id = requests.post(
@@ -48,6 +60,13 @@ class TestSessionControl(unittest.TestCase):
        ).json()
        rid = None

+        # open an existing session, should get session_id as None
+        response = requests.post(
+            self.base_url + "/open_session",
+            json={"capacity_of_str_len": 1000, "session_id": session_id},
+        ).json()
+        assert isinstance(response, dict) and "error" in response
+
        first_rid = None
        outputs_from_session = []
        for i, chunk_ids in enumerate(chunks_ids):
@@ -55,11 +74,16 @@ class TestSessionControl(unittest.TestCase):
                self.base_url + "/generate",
                json={
                    "input_ids": chunk_ids,
-                    "session": [session_id, rid],
+                    "session_params": {
+                        "id": session_id,
+                        "rid": rid,
+                        "offset": -1,
+                        "replace": True,
+                    },
                    "sampling_params": {
                        "temperature": 0,
                        "max_new_tokens": (
-                            16 if i > 0 else 0
+                            gen_len if i > 0 else 1
                        ),  # prefill only for the first chunk
                        "no_stop_trim": True,
                        "skip_special_tokens": False,
@@ -77,10 +101,15 @@ class TestSessionControl(unittest.TestCase):
            self.base_url + "/generate",
            json={
                "input_ids": chunks_ids[-1],
-                "session": [session_id, first_rid],
+                "session_params": {
+                    "id": session_id,
+                    "rid": first_rid,
+                    "offset": -1,
+                    "replace": True,
+                },
                "sampling_params": {
                    "temperature": 0,
-                    "max_new_tokens": 16,
+                    "max_new_tokens": gen_len,
                    "no_stop_trim": True,
                    "skip_special_tokens": False,
                },
@@ -93,10 +122,15 @@ class TestSessionControl(unittest.TestCase):
            self.base_url + "/generate",
            json={
                "input_ids": chunks_ids[-1],
-                "session": [session_id, rid],
+                "session_params": {
+                    "id": session_id,
+                    "rid": rid,
+                    "offset": -1,
+                    "replace": True,
+                },
                "sampling_params": {
                    "temperature": 0,
-                    "max_new_tokens": 16,
+                    "max_new_tokens": gen_len,
                    "no_stop_trim": True,
                    "skip_special_tokens": False,
                },
@@ -115,10 +149,15 @@ class TestSessionControl(unittest.TestCase):
            self.base_url + "/generate",
            json={
                "input_ids": chunks_ids[-1],
-                "session": [session_id, first_rid],
+                "session_params": {
+                    "id": session_id,
+                    "rid": first_rid,
+                    "offset": -1,
+                    "replace": True,
+                },
                "sampling_params": {
                    "temperature": 0,
-                    "max_new_tokens": 16,
+                    "max_new_tokens": gen_len,
                    "no_stop_trim": True,
                    "skip_special_tokens": False,
                },
@@ -127,6 +166,8 @@ class TestSessionControl(unittest.TestCase):
        assert response["meta_info"]["finish_reason"]["type"] == "abort"

        # 2. not use session control
+        requests.post(self.base_url + "/flush_cache")
+
        input_ids_first_req = None
        input_ids = []
        outputs_normal = []
@@ -139,7 +180,7 @@ class TestSessionControl(unittest.TestCase):
                    "sampling_params": {
                        "temperature": 0,
                        "max_new_tokens": (
-                            16 if i > 0 else 0
+                            gen_len if i > 0 else 1
                        ),  # prefill only for the first chunk
                        "no_stop_trim": True,
                        "skip_special_tokens": False,
@@ -150,7 +191,7 @@ class TestSessionControl(unittest.TestCase):
                output_ids = tokenizer.encode(response["text"])
                if output_ids[0] == tokenizer.bos_token_id:
                    output_ids = output_ids[1:]
-                input_ids += output_ids
+                input_ids += output_ids[:-1]
                outputs_normal.append(response["text"])
            if i == 0:
                input_ids_first_req = input_ids.copy()
@@ -162,7 +203,7 @@ class TestSessionControl(unittest.TestCase):
                "input_ids": input_ids_first_req,
                "sampling_params": {
                    "temperature": 0,
-                    "max_new_tokens": 16,
+                    "max_new_tokens": gen_len,
                    "no_stop_trim": True,
                    "skip_special_tokens": False,
                },
@@ -176,6 +217,272 @@ class TestSessionControl(unittest.TestCase):
        print(outputs_normal)
        assert outputs_from_session == outputs_normal

+    async def async_generate(self, payload):
+        url = self.base_url + "/generate"
+        async with aiohttp.ClientSession() as session:
+            async with session.post(url=url, json=payload) as response:
+                assert response.status == 200
+                async for chunk_bytes in response.content:
+                    chunk_bytes = chunk_bytes.strip()
+                    if not chunk_bytes:
+                        continue
+                    chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data: ")
+                    if chunk == "[DONE]":
+                        yield "", None, ""
+                    else:
+                        data = json.loads(chunk)
+                        finish_reason = (
+                            data["meta_info"]["finish_reason"]["type"]
+                            if data["meta_info"]["finish_reason"]
+                            else ""
+                        )
+                        yield data["text"], data["meta_info"]["id"], finish_reason
+
+    async def run_session_control_backtrack_with_abort(self, replace):
+        chunks = [
+            "Let me tell you something about France.",
+            "The capital of France is",
+        ]
+        tokenizer = get_tokenizer(self.model)
+        chunks_ids = [tokenizer.encode(x) for x in chunks]
+        for i in range(1, len(chunks_ids)):
+            if chunks_ids[i][0] == tokenizer.bos_token_id:
+                chunks_ids[i] = chunks_ids[i][1:]
+
+        # 1. using session control
+        session_id = requests.post(
+            self.base_url + "/open_session",
+            json={"capacity_of_str_len": 1000},
+        ).json()
+        rid = None
+
+        payload = {
+            "input_ids": chunks_ids[0],
+            "session_params": {
+                "id": session_id,
+                "rid": rid,
+                "offset": -1,
+                "replace": True,
+            },
+            "sampling_params": {
+                "temperature": 0,
+                "max_new_tokens": 100,
+                "no_stop_trim": True,
+                "skip_special_tokens": False,
+                "ignore_eos": True,
+            },
+            "stream": True,
+        }
+        gen_so_far = ""
+        finish_reason = ""
+        second_output = ""
+        async for chunk, rid, finish_reason_chunk in self.async_generate(payload):
+            gen_so_far += chunk
+            if finish_reason == "":
+                finish_reason = finish_reason_chunk
+            if len(gen_so_far) > 50 and second_output == "":
+                payload2 = {
+                    "input_ids": chunks_ids[1],
+                    "session_params": {
+                        "id": session_id,
+                        "rid": rid,
+                        "offset": 50,
+                        "replace": replace,
+                    },
+                    "sampling_params": {
+                        "temperature": 0,
+                        "max_new_tokens": 32,
+                        "no_stop_trim": True,
+                        "skip_special_tokens": False,
+                    },
+                    "stream": False,
+                    "stream_output": True,
+                }
+                response = requests.post(
+                    url=self.base_url + "/generate", json=payload2
+                ).json()
+                second_output = response["text"]
+        if replace:
+            assert finish_reason == "abort"
+        print("first request output:")
+        print(gen_so_far)
+        print("second request output:")
+        print(second_output)
+
+        # close the session
+        ret = requests.post(
+            self.base_url + "/close_session",
+            json={"session_id": session_id},
+        )
+        assert ret.status_code == 200
+
+        if not replace:
+            assert response["meta_info"]["finish_reason"]["type"] == "abort"
+        else:
+            # 2. not using session control
+            output_ids = tokenizer.encode(gen_so_far)
+            if output_ids[0] == tokenizer.bos_token_id:
+                output_ids = output_ids[1:]
+            input_ids = chunks_ids[0] + output_ids
+            input_ids = input_ids[:50] + chunks_ids[1]
+            payload = {
+                "input_ids": input_ids,
+                "sampling_params": {
+                    "temperature": 0,
+                    "max_new_tokens": 32,
+                    "no_stop_trim": True,
+                    "skip_special_tokens": False,
+                },
+                "stream": False,
+                "stream_output": True,
+            }
+            response = requests.post(
+                url=self.base_url + "/generate", json=payload
+            ).json()
+            output_no_session = response["text"]
+            print("second request output without session:")
+            print(output_no_session)
+            assert second_output == output_no_session
+
+    def test_session_control_backtrack_with_abort(self):
+        asyncio.run(self.run_session_control_backtrack_with_abort(replace=True))
+        asyncio.run(self.run_session_control_backtrack_with_abort(replace=False))
+
+    def run_session_control_with_branching(
+        self, root_prompt, chunks_per_step, gen_len=16
+    ):
+        for x in chunks_per_step:
+            assert len(x) == len(chunks_per_step[0])
+
+        # 1. using session control
+        session_id = requests.post(
+            self.base_url + "/open_session",
+            json={"capacity_of_str_len": 1000},
+        ).json()
+
+        outputs_from_session = []
+        # send the root prompt
+        response = requests.post(
+            self.base_url + "/generate",
+            json={
+                "text": root_prompt,
+                "session_params": {
+                    "id": session_id,
+                    "rid": None,
+                    "offset": 0,
+                    "replace": False,
+                },
+                "sampling_params": {
+                    "temperature": 0,
+                    "max_new_tokens": gen_len,
+                    "no_stop_trim": True,
+                    "skip_special_tokens": False,
+                },
+            },
+        ).json()
+        rid_per_branch = [response["meta_info"]["id"]] * len(chunks_per_step[0])
+        outputs_from_session.append(response["text"])
+
+        # send the prompts in branches
+        for chunks_for_branches in chunks_per_step:
+            for j, chunk in enumerate(chunks_for_branches):
+                response = requests.post(
+                    self.base_url + "/generate",
+                    json={
+                        "text": chunk,
+                        "session_params": {
+                            "id": session_id,
+                            "rid": rid_per_branch[j],
+                            "offset": 0,
+                            "replace": False,
+                        },
+                        "sampling_params": {
+                            "temperature": 0,
+                            "max_new_tokens": gen_len,
+                            "no_stop_trim": True,
+                            "skip_special_tokens": False,
+                        },
+                    },
+                ).json()
+                rid = response["meta_info"]["id"]
+                rid_per_branch[j] = rid
+                outputs_from_session.append(response["text"])
+
+        # close the session
+        ret = requests.post(
+            self.base_url + "/close_session",
+            json={"session_id": session_id},
+        )
+        assert ret.status_code == 200
+
+        # 2. not use session control
+        requests.post(self.base_url + "/flush_cache")
+
+        outputs_normal = []
+        input_texts = [root_prompt] * len(chunks_per_step[0])
+        # send the root prompt
+        response = requests.post(
+            self.base_url + "/generate",
+            json={
+                "text": root_prompt,
+                "sampling_params": {
+                    "temperature": 0,
+                    "max_new_tokens": gen_len,
+                    "no_stop_trim": True,
+                    "skip_special_tokens": False,
+                },
+            },
+        ).json()
+        outputs_normal.append(response["text"])
+        input_texts = [x + response["text"] for x in input_texts]
+
+        # send the prompts in branches
+        for chunks_for_branches in chunks_per_step:
+            for j, chunk in enumerate(chunks_for_branches):
+                input_texts[j] += chunk
+                response = requests.post(
+                    self.base_url + "/generate",
+                    json={
+                        "text": input_texts[j],
+                        "sampling_params": {
+                            "temperature": 0,
+                            "max_new_tokens": gen_len,
+                            "no_stop_trim": True,
+                            "skip_special_tokens": False,
+                        },
+                    },
+                ).json()
+                outputs_normal.append(response["text"])
+                input_texts[j] += response["text"]
+
+        print("====== outputs from chunked queries with session control: =======")
+        print(outputs_from_session)
+        print("====== outputs from normal queries: =======")
+        print(outputs_normal)
+        assert outputs_from_session == outputs_normal
+
+    def test_session_control_with_branching(self):
+        root_prompt = "First, let me explain in one sentence about AI"
+        chunks_per_step = [
+            [
+                "Then, briefly, the positive side of AI is",
+                "But, briefly, AI could be harmful to human",
+            ],
+            ["For example", "For example"],
+        ]
+        self.run_session_control_with_branching(
+            root_prompt=root_prompt, chunks_per_step=chunks_per_step, gen_len=8
+        )
+
+        root_prompt = "I have three apples."
+        chunks_per_step = [
+            ["I then give one apple to my friend", "My friend give me another apple."],
+            ["I still have", "I now have"],
+        ]
+        self.run_session_control_with_branching(
+            root_prompt=root_prompt, chunks_per_step=chunks_per_step, gen_len=8
+        )
+

 class TestSessionControlVision(unittest.TestCase):
    @classmethod
@@ -197,17 +504,25 @@ class TestSessionControlVision(unittest.TestCase):
        text_chunks = [
            "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n",
            "<|im_start|>user\n<image>\nDescribe this image in a very short sentence.<|im_end|>\n<|im_start|>assistant\n",
-            "<|im_start|>user\n<image>\nIs this image same with the previous image? Answer yes or no.<|im_end|>\n<|im_start|>assistant\n",
-            "<|im_start|>user\n<image>\nIs this image same with the previous image? Answer yes or no.<|im_end|>\n<|im_start|>assistant\n",
+            "<|im_start|>user\n<image>\nIs this image same with one of the previous images?<|im_end|>\n<|im_start|>assistant\n",
+            "<|im_start|>user\n<image>\nIs this image same with one of the previous images?<|im_end|>\n<|im_start|>assistant\n",
+            "<|im_start|>user\nDescribe this image in a very short sentence.<|im_end|>\nassistant:",
        ]
        image_chunks = [
+            "https://raw.githubusercontent.com/sgl-project/sglang/main/test/lang/example_image.png",
+            "https://raw.githubusercontent.com/sgl-project/sglang/main/test/lang/example_image.png",
            "https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png",
-            "https://raw.githubusercontent.com/sgl-project/sglang/main/test/lang/example_image.png",
-            "https://raw.githubusercontent.com/sgl-project/sglang/main/test/lang/example_image.png",
        ]
-        assert len(text_chunks) == len(image_chunks) + 1
+
+        assert (
+            len(text_chunks) == len(image_chunks) + 2
+        )  # the first and the last prompt does not contain images
        tokenizer = get_tokenizer(self.model)
        text_input_ids = [tokenizer.encode(x) for x in text_chunks]
+        for i in range(1, len(text_input_ids)):
+            if text_input_ids[i][0] == tokenizer.bos_token_id:
+                text_input_ids[i] = text_input_ids[i][1:]
+        gen_len = 32

        # 1. using session control
        session_id = requests.post(
@@ -216,20 +531,32 @@ class TestSessionControlVision(unittest.TestCase):
        ).json()
        rid = None

+        # open an existing session, should get session_id as None
+        response = requests.post(
+            self.base_url + "/open_session",
+            json={"capacity_of_str_len": 1000, "session_id": session_id},
+        ).json()
+        assert isinstance(response, dict) and "error" in response
+
        first_rid = None
        outputs_from_session = []
-        for i in range(len(text_input_ids)):
+        for i in range(len(text_input_ids[:-1])):
            response = requests.post(
                self.base_url + "/generate",
                json={
                    "input_ids": text_input_ids[i],
                    "image_data": image_chunks[i - 1] if i > 0 else None,
                    "modalities": ["multi-images"],
-                    "session": [session_id, rid],
+                    "session_params": {
+                        "id": session_id,
+                        "rid": rid,
+                        "offset": 0,
+                        "replace": True,
+                    },
                    "sampling_params": {
                        "temperature": 0,
                        "max_new_tokens": (
-                            16 if i > 0 else 0
+                            gen_len if i > 0 else 0
                        ),  # prefill only for the first chunk
                        "no_stop_trim": True,
                        "skip_special_tokens": False,
@@ -247,12 +574,15 @@ class TestSessionControlVision(unittest.TestCase):
            self.base_url + "/generate",
            json={
                "input_ids": text_input_ids[-1],
-                "image_data": image_chunks[-1:],
-                "modalities": ["multi-images"],
-                "session": [session_id, first_rid],
+                "session_params": {
+                    "id": session_id,
+                    "rid": first_rid,
+                    "offset": 0,
+                    "replace": True,
+                },
                "sampling_params": {
                    "temperature": 0,
-                    "max_new_tokens": 16,
+                    "max_new_tokens": gen_len,
                    "no_stop_trim": True,
                    "skip_special_tokens": False,
                },
@@ -265,12 +595,15 @@ class TestSessionControlVision(unittest.TestCase):
            self.base_url + "/generate",
            json={
                "input_ids": text_input_ids[-1],
-                "image_data": image_chunks[-1:],
-                "modalities": ["multi-images"],
-                "session": [session_id, rid],
+                "session_params": {
+                    "id": session_id,
+                    "rid": rid,
+                    "offset": 0,
+                    "replace": True,
+                },
                "sampling_params": {
                    "temperature": 0,
-                    "max_new_tokens": 16,
+                    "max_new_tokens": gen_len,
                    "no_stop_trim": True,
                    "skip_special_tokens": False,
                },
@@ -289,10 +622,15 @@ class TestSessionControlVision(unittest.TestCase):
            self.base_url + "/generate",
            json={
                "input_ids": text_input_ids[-1],
-                "session": [session_id, first_rid],
+                "session_params": {
+                    "id": session_id,
+                    "rid": first_rid,
+                    "offset": 0,
+                    "replace": True,
+                },
                "sampling_params": {
                    "temperature": 0,
-                    "max_new_tokens": 16,
+                    "max_new_tokens": gen_len,
                    "no_stop_trim": True,
                    "skip_special_tokens": False,
                },
@@ -306,7 +644,7 @@ class TestSessionControlVision(unittest.TestCase):
        input_ids_first_req = None
        input_ids = []
        outputs_normal = []
-        for i in range(len(text_input_ids)):
+        for i in range(len(text_input_ids[:-1])):
            input_ids += text_input_ids[i]
            image_data = image_chunks[:i] if i > 0 else None
            response = requests.post(
@@ -318,7 +656,7 @@ class TestSessionControlVision(unittest.TestCase):
                    "sampling_params": {
                        "temperature": 0,
                        "max_new_tokens": (
-                            16 if i > 0 else 0
+                            gen_len if i > 0 else 0
                        ),  # prefill only for the first chunk
                        "no_stop_trim": True,
                        "skip_special_tokens": False,
@@ -339,11 +677,9 @@ class TestSessionControlVision(unittest.TestCase):
            self.base_url + "/generate",
            json={
                "input_ids": input_ids_first_req,
-                "image_data": image_chunks[-1:],
-                "modalities": ["multi-images"],
                "sampling_params": {
                    "temperature": 0,
-                    "max_new_tokens": 16,
+                    "max_new_tokens": gen_len,
                    "no_stop_trim": True,
                    "skip_special_tokens": False,
                },