Test OpenAI vision API (#925)

2024-08-04 20:51:55 -07:00
parent ebf69964cd
commit 3bc99e6fe4
13 changed files with 102 additions and 228 deletions
--- a/README.md
+++ b/README.md
@@ -136,7 +136,7 @@ response = client.chat.completions.create(
 print(response)
 ```

-It supports streaming, vision, and most features of the Chat/Completions/Models endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
+It supports streaming, vision, and most features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).

 ### Additional Server Arguments
 - Add `--tp 2` to enable tensor parallelism. If it indicates `peer access is not supported between these two devices`, add `--enable-p2p-check` option.
--- a/python/sglang/srt/managers/tokenizer_manager.py
+++ b/python/sglang/srt/managers/tokenizer_manager.py
@@ -390,8 +390,13 @@ class TokenizerManager:
                obj.return_text_in_logprobs,
            )

+            # Log requests
            if self.server_args.log_requests and state.finished:
-                logger.info(f"in={obj.text}, out={out}")
+                if obj.text is None:
+                    in_obj = {"text": self.tokenizer.decode(obj.input_ids)}
+                else:
+                    in_obj = {"text": obj.text}
+                logger.info(f"in={in_obj}, out={out}")

            state.out_list = []
            if state.finished:
--- a/python/sglang/test/test_utils.py
+++ b/python/sglang/test/test_utils.py
@@ -18,7 +18,7 @@ from sglang.lang.backend.openai import OpenAI
 from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
 from sglang.utils import get_exception_traceback

-MODEL_NAME_FOR_TEST = "meta-llama/Meta-Llama-3.1-8B-Instruct"
+DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Meta-Llama-3.1-8B-Instruct"


 def call_generate_lightllm(prompt, temperature, max_tokens, stop=None, url=None):
--- a/scripts/deprecated/test_openai_server.py
+++ b/scripts/deprecated/test_openai_server.py
@@ -1,209 +0,0 @@
-"""
-First run the following command to launch the server.
-Note that TinyLlama adopts different chat templates in different versions.
-For v0.4, the chat template is chatml.
-
-python3 -m sglang.launch_server --model-path TinyLlama/TinyLlama-1.1B-Chat-v0.4 \
--port 30000 --chat-template chatml
-
-Output example:
-The capital of France is Paris.
-The capital of the United States is Washington, D.C.
-The capital of Canada is Ottawa.
-The capital of Japan is Tokyo
-"""
-
-import argparse
-import json
-
-import openai
-
-
-def test_completion(args, echo, logprobs):
-    client = openai.Client(api_key="EMPTY", base_url=args.base_url)
-    response = client.completions.create(
-        model="default",
-        prompt="The capital of France is",
-        temperature=0,
-        max_tokens=32,
-        echo=echo,
-        logprobs=logprobs,
-    )
-    text = response.choices[0].text
-    print(response.choices[0].text)
-    if echo:
-        assert text.startswith("The capital of France is")
-    if logprobs:
-        print(response.choices[0].logprobs.top_logprobs)
-        assert response.choices[0].logprobs
-        if echo:
-            assert response.choices[0].logprobs.token_logprobs[0] == None
-        else:
-            assert response.choices[0].logprobs.token_logprobs[0] != None
-    assert response.id
-    assert response.created
-    assert response.usage.prompt_tokens > 0
-    assert response.usage.completion_tokens > 0
-    assert response.usage.total_tokens > 0
-    print("=" * 100)
-
-
-def test_completion_stream(args, echo, logprobs):
-    client = openai.Client(api_key="EMPTY", base_url=args.base_url)
-    response = client.completions.create(
-        model="default",
-        prompt="The capital of France is",
-        temperature=0,
-        max_tokens=32,
-        stream=True,
-        echo=echo,
-        logprobs=logprobs,
-    )
-    first = True
-    for r in response:
-        if first:
-            if echo:
-                assert r.choices[0].text.startswith("The capital of France is")
-            first = False
-        if logprobs:
-            print(
-                f"{r.choices[0].text:12s}\t" f"{r.choices[0].logprobs.token_logprobs}",
-                flush=True,
-            )
-            print(r.choices[0].logprobs.top_logprobs)
-        else:
-            print(r.choices[0].text, end="", flush=True)
-        assert r.id
-        assert r.usage.prompt_tokens > 0
-        assert r.usage.completion_tokens > 0
-        assert r.usage.total_tokens > 0
-    print("=" * 100)
-
-
-def test_chat_completion(args):
-    client = openai.Client(api_key="EMPTY", base_url=args.base_url)
-    response = client.chat.completions.create(
-        model="default",
-        messages=[
-            {"role": "system", "content": "You are a helpful AI assistant"},
-            {"role": "user", "content": "What is the capital of France?"},
-        ],
-        temperature=0,
-        max_tokens=32,
-    )
-    print(response.choices[0].message.content)
-    assert response.id
-    assert response.created
-    assert response.usage.prompt_tokens > 0
-    assert response.usage.completion_tokens > 0
-    assert response.usage.total_tokens > 0
-    print("=" * 100)
-
-
-def test_chat_completion_image(args):
-    client = openai.Client(api_key="EMPTY", base_url=args.base_url)
-    response = client.chat.completions.create(
-        model="default",
-        messages=[
-            {"role": "system", "content": "You are a helpful AI assistant"},
-            {
-                "role": "user",
-                "content": [
-                    {"type": "text", "text": "Describe this image"},
-                    {
-                        "type": "image_url",
-                        "image_url": {
-                            "url": "https://raw.githubusercontent.com/sgl-project/sglang/main/assets/mixtral_8x7b.jpg"
-                        },
-                    },
-                ],
-            },
-        ],
-        temperature=0,
-        max_tokens=32,
-    )
-    print(response.choices[0].message.content)
-    assert response.id
-    assert response.created
-    assert response.usage.prompt_tokens > 0
-    assert response.usage.completion_tokens > 0
-    assert response.usage.total_tokens > 0
-    print("=" * 100)
-
-
-def test_chat_completion_stream(args):
-    client = openai.Client(api_key="EMPTY", base_url=args.base_url)
-    response = client.chat.completions.create(
-        model="default",
-        messages=[
-            {"role": "system", "content": "You are a helpful AI assistant"},
-            {"role": "user", "content": "List 3 countries and their capitals."},
-        ],
-        temperature=0,
-        max_tokens=64,
-        stream=True,
-    )
-    is_first = True
-    for chunk in response:
-        if is_first:
-            is_first = False
-            assert chunk.choices[0].delta.role == "assistant"
-            continue
-
-        data = chunk.choices[0].delta
-        if not data.content:
-            continue
-        print(data.content, end="", flush=True)
-    print("=" * 100)
-
-
-def test_regex(args):
-    client = openai.Client(api_key="EMPTY", base_url=args.base_url)
-
-    regex = (
-        r"""\{\n"""
-        + r"""   "name": "[\w]+",\n"""
-        + r"""   "population": [\d]+\n"""
-        + r"""\}"""
-    )
-
-    response = client.chat.completions.create(
-        model="default",
-        messages=[
-            {"role": "system", "content": "You are a helpful AI assistant"},
-            {"role": "user", "content": "Introduce the capital of France."},
-        ],
-        temperature=0,
-        max_tokens=128,
-        extra_body={"regex": regex},
-    )
-    text = response.choices[0].message.content
-    print(json.loads(text))
-    print("=" * 100)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--base-url", type=str, default="http://127.0.0.1:30000/v1")
-    parser.add_argument(
-        "--test-image", action="store_true", help="Enables testing image inputs"
-    )
-    args = parser.parse_args()
-
-    test_completion(args, echo=False, logprobs=False)
-    test_completion(args, echo=True, logprobs=False)
-    test_completion(args, echo=False, logprobs=True)
-    test_completion(args, echo=True, logprobs=True)
-    test_completion(args, echo=False, logprobs=3)
-    test_completion(args, echo=True, logprobs=3)
-    test_completion_stream(args, echo=False, logprobs=False)
-    test_completion_stream(args, echo=True, logprobs=False)
-    test_completion_stream(args, echo=False, logprobs=True)
-    test_completion_stream(args, echo=True, logprobs=True)
-    test_completion_stream(args, echo=False, logprobs=3)
-    test_completion_stream(args, echo=True, logprobs=3)
-    test_chat_completion(args)
-    test_chat_completion_stream(args)
-    test_regex(args)
-    if args.test_image:
-        test_chat_completion_image(args)
--- a/test/lang/test_bind_cache.py
+++ b/test/lang/test_bind_cache.py
@@ -1,7 +1,7 @@
 import unittest

 import sglang as sgl
-from sglang.test.test_utils import MODEL_NAME_FOR_TEST
+from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST


 class TestBind(unittest.TestCase):
@@ -9,7 +9,7 @@ class TestBind(unittest.TestCase):

    @classmethod
    def setUpClass(cls):
-        cls.backend = sgl.Runtime(model_path=MODEL_NAME_FOR_TEST)
+        cls.backend = sgl.Runtime(model_path=DEFAULT_MODEL_NAME_FOR_TEST)
        sgl.set_default_backend(cls.backend)

    @classmethod
--- a/test/lang/test_srt_backend.py
+++ b/test/lang/test_srt_backend.py
@@ -14,7 +14,7 @@ from sglang.test.test_programs import (
    test_stream,
    test_tool_use,
 )
-from sglang.test.test_utils import MODEL_NAME_FOR_TEST
+from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST


 class TestSRTBackend(unittest.TestCase):
@@ -22,7 +22,7 @@ class TestSRTBackend(unittest.TestCase):

    @classmethod
    def setUpClass(cls):
-        cls.backend = sgl.Runtime(model_path=MODEL_NAME_FOR_TEST)
+        cls.backend = sgl.Runtime(model_path=DEFAULT_MODEL_NAME_FOR_TEST)
        sgl.set_default_backend(cls.backend)

    @classmethod
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -5,8 +5,9 @@ from sglang.test.test_utils import run_unittest_files

 suites = {
    "minimal": [
-        "test_openai_server.py",
        "test_eval_accuracy.py",
+        "test_openai_server.py",
+        "test_vision_openai_server.py",
        "test_chunked_prefill.py",
        "test_torch_compile.py",
        "models/test_causal_models.py",
--- a/test/srt/test_chunked_prefill.py
+++ b/test/srt/test_chunked_prefill.py
@@ -3,14 +3,14 @@ from types import SimpleNamespace

 from sglang.srt.utils import kill_child_process
 from sglang.test.run_eval import run_eval
-from sglang.test.test_utils import MODEL_NAME_FOR_TEST, popen_launch_server
+from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, popen_launch_server


 class TestAccuracy(unittest.TestCase):

    @classmethod
    def setUpClass(cls):
-        cls.model = MODEL_NAME_FOR_TEST
+        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
        cls.base_url = f"http://localhost:8157"
        cls.process = popen_launch_server(
            cls.model,
--- a/test/srt/test_eval_accuracy.py
+++ b/test/srt/test_eval_accuracy.py
@@ -3,14 +3,14 @@ from types import SimpleNamespace

 from sglang.srt.utils import kill_child_process
 from sglang.test.run_eval import run_eval
-from sglang.test.test_utils import MODEL_NAME_FOR_TEST, popen_launch_server
+from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, popen_launch_server


 class TestAccuracy(unittest.TestCase):

    @classmethod
    def setUpClass(cls):
-        cls.model = MODEL_NAME_FOR_TEST
+        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
        cls.base_url = f"http://localhost:8157"
        cls.process = popen_launch_server(cls.model, cls.base_url, timeout=300)

--- a/test/srt/test_openai_server.py
+++ b/test/srt/test_openai_server.py
@@ -5,21 +5,21 @@ import openai

 from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.utils import kill_child_process
-from sglang.test.test_utils import MODEL_NAME_FOR_TEST, popen_launch_server
+from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, popen_launch_server


 class TestOpenAIServer(unittest.TestCase):

    @classmethod
    def setUpClass(cls):
-        cls.model = MODEL_NAME_FOR_TEST
+        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
        cls.base_url = f"http://localhost:8157"
        cls.api_key = "sk-123456"
        cls.process = popen_launch_server(
            cls.model, cls.base_url, timeout=300, api_key=cls.api_key
        )
        cls.base_url += "/v1"
-        cls.tokenizer = get_tokenizer(MODEL_NAME_FOR_TEST)
+        cls.tokenizer = get_tokenizer(DEFAULT_MODEL_NAME_FOR_TEST)

    @classmethod
    def tearDownClass(cls):
@@ -147,6 +147,7 @@ class TestOpenAIServer(unittest.TestCase):
            top_logprobs=logprobs,
            n=parallel_sample_num,
        )
+
        if logprobs:
            assert isinstance(
                response.choices[0].logprobs.content[0].top_logprobs[0].token, str
@@ -158,6 +159,7 @@ class TestOpenAIServer(unittest.TestCase):
            assert (
                ret_num_top_logprobs == logprobs
            ), f"{ret_num_top_logprobs} vs {logprobs}"
+
        assert len(response.choices) == parallel_sample_num
        assert response.choices[0].message.role == "assistant"
        assert isinstance(response.choices[0].message.content, str)
--- a/test/srt/test_srt_endpoint.py
+++ b/test/srt/test_srt_endpoint.py
@@ -5,14 +5,14 @@ import requests

 from sglang.srt.utils import kill_child_process
 from sglang.test.run_eval import run_eval
-from sglang.test.test_utils import MODEL_NAME_FOR_TEST, popen_launch_server
+from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, popen_launch_server


 class TestSRTEndpoint(unittest.TestCase):

    @classmethod
    def setUpClass(cls):
-        cls.model = MODEL_NAME_FOR_TEST
+        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
        cls.base_url = f"http://localhost:{8157}"
        cls.process = popen_launch_server(cls.model, cls.base_url, timeout=300)

--- a/test/srt/test_torch_compile.py
+++ b/test/srt/test_torch_compile.py
@@ -3,14 +3,14 @@ from types import SimpleNamespace

 from sglang.srt.utils import kill_child_process
 from sglang.test.run_eval import run_eval
-from sglang.test.test_utils import MODEL_NAME_FOR_TEST, popen_launch_server
+from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, popen_launch_server


 class TestAccuracy(unittest.TestCase):

    @classmethod
    def setUpClass(cls):
-        cls.model = MODEL_NAME_FOR_TEST
+        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
        cls.base_url = f"http://localhost:8157"
        cls.process = popen_launch_server(
            cls.model, cls.base_url, timeout=300, other_args=["--enable-torch-compile"]
--- a/test/srt/test_vision_openai_server.py
+++ b/test/srt/test_vision_openai_server.py
@@ -0,0 +1,75 @@
+import json
+import unittest
+
+import openai
+
+from sglang.srt.hf_transformers_utils import get_tokenizer
+from sglang.srt.utils import kill_child_process
+from sglang.test.test_utils import popen_launch_server
+
+
+class TestOpenAIVisionServer(unittest.TestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "liuhaotian/llava-v1.6-vicuna-7b"
+        cls.base_url = "http://localhost:8157"
+        cls.api_key = "sk-123456"
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=300,
+            api_key=cls.api_key,
+            other_args=[
+                "--chat-template",
+                "vicuna_v1.1",
+                "--tokenizer-path",
+                "llava-hf/llava-1.5-7b-hf",
+                "--log-requests",
+            ],
+        )
+        cls.base_url += "/v1"
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_child_process(cls.process.pid)
+
+    def test_chat_completion(self):
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+
+        response = client.chat.completions.create(
+            model="default",
+            messages=[
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": "https://github.com/sgl-project/sglang/blob/main/assets/logo.png?raw=true"
+                            },
+                        },
+                        {"type": "text", "text": "Describe this image"},
+                    ],
+                },
+            ],
+            temperature=0,
+            max_tokens=32,
+        )
+
+        assert response.choices[0].message.role == "assistant"
+        assert isinstance(response.choices[0].message.content, str)
+        assert response.id
+        assert response.created
+        assert response.usage.prompt_tokens > 0
+        assert response.usage.completion_tokens > 0
+        assert response.usage.total_tokens > 0
+
+
+if __name__ == "__main__":
+    unittest.main(warnings="ignore")
+
+    # t = TestOpenAIVisionServer()
+    # t.setUpClass()
+    # t.test_chat_completion()
+    # t.tearDownClass()