[feat] Support session control for vision language models (#2210)

2024-11-27 00:03:29 -08:00
parent c754652fcd
commit 37c8a5761f
7 changed files with 265 additions and 21 deletions
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -36,6 +36,7 @@ suites = {
        "test_triton_attention_backend.py",
        "test_update_weights.py",
        "test_vision_openai_server.py",
+        "test_session_control.py",
    ],
    "sampling/penaltylib": glob.glob(
        "sampling/penaltylib/**/test_*.py", recursive=True
--- a/test/srt/test_session_control.py
+++ b/test/srt/test_session_control.py
@@ -1,7 +1,7 @@
 """
 Usage:
 python3 -m unittest test_session_control.TestSessionControl.test_session_control
-python3 -m unittest test_session_control.TestSessionControl.test_session_control_vlm
+python3 -m unittest test_session_control.TestSessionControlVision.test_session_control
 """

 import unittest
@@ -61,6 +61,8 @@ class TestSessionControl(unittest.TestCase):
                        "max_new_tokens": (
                            16 if i > 0 else 0
                        ),  # prefill only for the first chunk
+                        "no_stop_trim": True,
+                        "skip_special_tokens": False,
                    },
                },
            ).json()
@@ -79,6 +81,8 @@ class TestSessionControl(unittest.TestCase):
                "sampling_params": {
                    "temperature": 0,
                    "max_new_tokens": 16,
+                    "no_stop_trim": True,
+                    "skip_special_tokens": False,
                },
            },
        ).json()
@@ -93,6 +97,8 @@ class TestSessionControl(unittest.TestCase):
                "sampling_params": {
                    "temperature": 0,
                    "max_new_tokens": 16,
+                    "no_stop_trim": True,
+                    "skip_special_tokens": False,
                },
            },
        ).json()
@@ -113,6 +119,8 @@ class TestSessionControl(unittest.TestCase):
                "sampling_params": {
                    "temperature": 0,
                    "max_new_tokens": 16,
+                    "no_stop_trim": True,
+                    "skip_special_tokens": False,
                },
            },
        ).json()
@@ -133,13 +141,16 @@ class TestSessionControl(unittest.TestCase):
                        "max_new_tokens": (
                            16 if i > 0 else 0
                        ),  # prefill only for the first chunk
+                        "no_stop_trim": True,
+                        "skip_special_tokens": False,
                    },
                },
            ).json()
            if i > 0:
-                input_ids += tokenizer.encode(response["text"])[
-                    1:
-                ]  # drop the bos token
+                output_ids = tokenizer.encode(response["text"])
+                if output_ids[0] == tokenizer.bos_token_id:
+                    output_ids = output_ids[1:]
+                input_ids += output_ids
                outputs_normal.append(response["text"])
            if i == 0:
                input_ids_first_req = input_ids.copy()
@@ -152,6 +163,187 @@ class TestSessionControl(unittest.TestCase):
                "sampling_params": {
                    "temperature": 0,
                    "max_new_tokens": 16,
+                    "no_stop_trim": True,
+                    "skip_special_tokens": False,
+                },
+            },
+        ).json()
+        outputs_normal.append(response["text"])
+
+        print("outputs from chunked queries with session control:")
+        print(outputs_from_session)
+        print("outputs from normal queries:")
+        print(outputs_normal)
+        assert outputs_from_session == outputs_normal
+
+
+class TestSessionControlVision(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "lmms-lab/llava-onevision-qwen2-7b-ov"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            # other_args={"--disable-radix"},
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_child_process(cls.process.pid, include_self=True)
+
+    def test_session_control(self):
+        text_chunks = [
+            "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n",
+            "<|im_start|>user\n<image>\nDescribe this image in a very short sentence.<|im_end|>\n<|im_start|>assistant\n",
+            "<|im_start|>user\n<image>\nIs this image same with the previous image? Answer yes or no.<|im_end|>\n<|im_start|>assistant\n",
+            "<|im_start|>user\n<image>\nIs this image same with the previous image? Answer yes or no.<|im_end|>\n<|im_start|>assistant\n",
+        ]
+        image_chunks = [
+            "https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png",
+            "https://raw.githubusercontent.com/sgl-project/sglang/main/test/lang/example_image.png",
+            "https://raw.githubusercontent.com/sgl-project/sglang/main/test/lang/example_image.png",
+        ]
+        assert len(text_chunks) == len(image_chunks) + 1
+        tokenizer = get_tokenizer(self.model)
+        text_input_ids = [tokenizer.encode(x) for x in text_chunks]
+
+        # 1. using session control
+        session_id = requests.post(
+            self.base_url + "/open_session",
+            json={"capacity_of_str_len": 1000},
+        ).json()
+        rid = None
+
+        first_rid = None
+        outputs_from_session = []
+        for i in range(len(text_input_ids)):
+            response = requests.post(
+                self.base_url + "/generate",
+                json={
+                    "input_ids": text_input_ids[i],
+                    "image_data": image_chunks[i - 1] if i > 0 else None,
+                    "modalities": ["multi-images"],
+                    "session": [session_id, rid],
+                    "sampling_params": {
+                        "temperature": 0,
+                        "max_new_tokens": (
+                            16 if i > 0 else 0
+                        ),  # prefill only for the first chunk
+                        "no_stop_trim": True,
+                        "skip_special_tokens": False,
+                    },
+                },
+            ).json()
+            rid = response["meta_info"]["id"]
+            if i == 0:
+                first_rid = rid
+            if i > 0:
+                outputs_from_session.append(response["text"])
+
+        # backtrack to the first request and regenerate
+        response = requests.post(
+            self.base_url + "/generate",
+            json={
+                "input_ids": text_input_ids[-1],
+                "image_data": image_chunks[-1:],
+                "modalities": ["multi-images"],
+                "session": [session_id, first_rid],
+                "sampling_params": {
+                    "temperature": 0,
+                    "max_new_tokens": 16,
+                    "no_stop_trim": True,
+                    "skip_special_tokens": False,
+                },
+            },
+        ).json()
+        outputs_from_session.append(response["text"])
+
+        # query with a non-existing rid (the last one should be disappeared becuase of backtrack), should see abort
+        response = requests.post(
+            self.base_url + "/generate",
+            json={
+                "input_ids": text_input_ids[-1],
+                "image_data": image_chunks[-1:],
+                "modalities": ["multi-images"],
+                "session": [session_id, rid],
+                "sampling_params": {
+                    "temperature": 0,
+                    "max_new_tokens": 16,
+                    "no_stop_trim": True,
+                    "skip_special_tokens": False,
+                },
+            },
+        ).json()
+        assert response["meta_info"]["finish_reason"]["type"] == "abort"
+
+        ret = requests.post(
+            self.base_url + "/close_session",
+            json={"session_id": session_id},
+        )
+        assert ret.status_code == 200
+
+        # send a request to a closed session, should see abort
+        response = requests.post(
+            self.base_url + "/generate",
+            json={
+                "input_ids": text_input_ids[-1],
+                "session": [session_id, first_rid],
+                "sampling_params": {
+                    "temperature": 0,
+                    "max_new_tokens": 16,
+                    "no_stop_trim": True,
+                    "skip_special_tokens": False,
+                },
+            },
+        ).json()
+        assert response["meta_info"]["finish_reason"]["type"] == "abort"
+
+        # 2. not use session control
+        input_ids_first_req = None
+        input_ids = []
+        outputs_normal = []
+        for i in range(len(text_input_ids)):
+            input_ids += text_input_ids[i]
+            image_data = image_chunks[:i] if i > 0 else None
+            response = requests.post(
+                self.base_url + "/generate",
+                json={
+                    "input_ids": input_ids,
+                    "image_data": image_data,
+                    "modalities": ["multi-images"],
+                    "sampling_params": {
+                        "temperature": 0,
+                        "max_new_tokens": (
+                            16 if i > 0 else 0
+                        ),  # prefill only for the first chunk
+                        "no_stop_trim": True,
+                        "skip_special_tokens": False,
+                    },
+                },
+            ).json()
+            if i > 0:
+                output_ids = tokenizer.encode(response["text"])
+                if output_ids[0] == tokenizer.bos_token_id:
+                    output_ids = output_ids[1:]
+                input_ids += output_ids
+                outputs_normal.append(response["text"])
+            if i == 0:
+                input_ids_first_req = input_ids.copy()
+
+        input_ids_first_req += text_input_ids[-1]
+        response = requests.post(
+            self.base_url + "/generate",
+            json={
+                "input_ids": input_ids_first_req,
+                "image_data": image_chunks[-1:],
+                "modalities": ["multi-images"],
+                "sampling_params": {
+                    "temperature": 0,
+                    "max_new_tokens": 16,
+                    "no_stop_trim": True,
+                    "skip_special_tokens": False,
                },
            },
        ).json()