Test openai vision api (#925)

2024-08-04 20:51:55 -07:00
parent ebf69964cd
commit 3bc99e6fe4
13 changed files with 102 additions and 228 deletions
--- a/README.md
+++ b/README.md
@@ -136,7 +136,7 @@ response = client.chat.completions.create(
 print(response)
 ```
-It supports streaming, vision, and most features of the Chat/Completions/Models endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
+It supports streaming, vision, and most features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
 ### Additional Server Arguments
 - Add `--tp 2` to enable tensor parallelism. If it indicates `peer access is not supported between these two devices`, add `--enable-p2p-check` option.
--- a/python/sglang/srt/managers/tokenizer_manager.py
+++ b/python/sglang/srt/managers/tokenizer_manager.py
@@ -390,8 +390,13 @@ class TokenizerManager:
                obj.return_text_in_logprobs,
            )
            # Log requests
            if self.server_args.log_requests and state.finished:
-                logger.info(f"in={obj.text}, out={out}")
+                if obj.text is None:
                    in_obj = {"text": self.tokenizer.decode(obj.input_ids)}
                else:
                    in_obj = {"text": obj.text}
                logger.info(f"in={in_obj}, out={out}")
            state.out_list = []
            if state.finished:
--- a/python/sglang/test/test_utils.py
+++ b/python/sglang/test/test_utils.py
@@ -18,7 +18,7 @@ from sglang.lang.backend.openai import OpenAI
 from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
 from sglang.utils import get_exception_traceback
-MODEL_NAME_FOR_TEST = "meta-llama/Meta-Llama-3.1-8B-Instruct"
+DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Meta-Llama-3.1-8B-Instruct"
 def call_generate_lightllm(prompt, temperature, max_tokens, stop=None, url=None):
--- a/scripts/deprecated/test_openai_server.py
+++ b/scripts/deprecated/test_openai_server.py
@@ -1,209 +0,0 @@
 """
 First run the following command to launch the server.
 Note that TinyLlama adopts different chat templates in different versions.
 For v0.4, the chat template is chatml.
 python3 -m sglang.launch_server --model-path TinyLlama/TinyLlama-1.1B-Chat-v0.4 \
 --port 30000 --chat-template chatml
 Output example:
 The capital of France is Paris.
 The capital of the United States is Washington, D.C.
 The capital of Canada is Ottawa.
 The capital of Japan is Tokyo
 """
 import argparse
 import json
 import openai
 def test_completion(args, echo, logprobs):
     """Send one non-streaming completion request and sanity-check the reply.

     Args:
         args: Parsed command-line namespace; only ``args.base_url`` is read.
         echo: Forwarded as the ``echo`` request field. When truthy, the
             returned text must start with the prompt.
         logprobs: Forwarded as the ``logprobs`` request field. When truthy,
             the returned log-probability data is printed and checked.

     Raises:
         AssertionError: If any of the response checks fails.
     """
     client = openai.Client(api_key="EMPTY", base_url=args.base_url)
     response = client.completions.create(
         model="default",
         prompt="The capital of France is",
         temperature=0,
         max_tokens=32,
         echo=echo,
         logprobs=logprobs,
     )
     choice = response.choices[0]
     text = choice.text
     print(text)
     if echo:
         assert text.startswith("The capital of France is")
     if logprobs:
         print(choice.logprobs.top_logprobs)
         assert choice.logprobs
         first_token_logprob = choice.logprobs.token_logprobs[0]
         # Compare against the None singleton by identity, not equality.
         if echo:
             # With echo the first entry is expected to be empty -- presumably
             # because it belongs to the first prompt token; confirm against
             # the server implementation.
             assert first_token_logprob is None
         else:
             assert first_token_logprob is not None
     assert response.id
     assert response.created
     assert response.usage.prompt_tokens > 0
     assert response.usage.completion_tokens > 0
     assert response.usage.total_tokens > 0
     print("=" * 100)
 def test_completion_stream(args, echo, logprobs):
     """Stream one completion request and check every chunk as it arrives.

     Args:
         args: Parsed command-line namespace; only ``args.base_url`` is read.
         echo: Forwarded as the ``echo`` request field. When truthy, the first
             chunk's text must start with the prompt.
         logprobs: Forwarded as the ``logprobs`` request field. When truthy,
             each chunk's log-probability data is printed next to its text.
     """
     client = openai.Client(api_key="EMPTY", base_url=args.base_url)
     stream = client.completions.create(
         model="default",
         prompt="The capital of France is",
         temperature=0,
         max_tokens=32,
         stream=True,
         echo=echo,
         logprobs=logprobs,
     )
     for index, chunk in enumerate(stream):
         choice = chunk.choices[0]
         # Only the very first chunk is checked for the echoed prompt.
         if index == 0 and echo:
             assert choice.text.startswith("The capital of France is")
         if not logprobs:
             print(choice.text, end="", flush=True)
         else:
             line = f"{choice.text:12s}\t{choice.logprobs.token_logprobs}"
             print(line, flush=True)
             print(choice.logprobs.top_logprobs)
         assert chunk.id
         usage = chunk.usage
         assert usage.prompt_tokens > 0
         assert usage.completion_tokens > 0
         assert usage.total_tokens > 0
     print("=" * 100)
 def test_chat_completion(args):
     """Send one non-streaming chat request and check the response metadata.

     Args:
         args: Parsed command-line namespace; only ``args.base_url`` is read.
     """
     conversation = [
         {"role": "system", "content": "You are a helpful AI assistant"},
         {"role": "user", "content": "What is the capital of France?"},
     ]
     client = openai.Client(api_key="EMPTY", base_url=args.base_url)
     response = client.chat.completions.create(
         model="default",
         messages=conversation,
         temperature=0,
         max_tokens=32,
     )
     print(response.choices[0].message.content)
     assert response.id
     assert response.created
     usage = response.usage
     assert usage.prompt_tokens > 0
     assert usage.completion_tokens > 0
     assert usage.total_tokens > 0
     print("=" * 100)
 def test_chat_completion_image(args):
     """Send a chat request mixing text and an image URL, then check the reply.

     Args:
         args: Parsed command-line namespace; only ``args.base_url`` is read.
     """
     image_part = {
         "type": "image_url",
         "image_url": {
             "url": "https://raw.githubusercontent.com/sgl-project/sglang/main/assets/mixtral_8x7b.jpg"
         },
     }
     text_part = {"type": "text", "text": "Describe this image"}
     conversation = [
         {"role": "system", "content": "You are a helpful AI assistant"},
         # The user turn carries the text prompt first, then the image.
         {"role": "user", "content": [text_part, image_part]},
     ]
     client = openai.Client(api_key="EMPTY", base_url=args.base_url)
     response = client.chat.completions.create(
         model="default",
         messages=conversation,
         temperature=0,
         max_tokens=32,
     )
     print(response.choices[0].message.content)
     assert response.id
     assert response.created
     usage = response.usage
     assert usage.prompt_tokens > 0
     assert usage.completion_tokens > 0
     assert usage.total_tokens > 0
     print("=" * 100)
 def test_chat_completion_stream(args):
     """Stream one chat request and print the generated content as it arrives.

     The first chunk must carry the ``assistant`` role and is not printed;
     later chunks are printed only when their delta has non-empty content.

     Args:
         args: Parsed command-line namespace; only ``args.base_url`` is read.
     """
     conversation = [
         {"role": "system", "content": "You are a helpful AI assistant"},
         {"role": "user", "content": "List 3 countries and their capitals."},
     ]
     client = openai.Client(api_key="EMPTY", base_url=args.base_url)
     stream = client.chat.completions.create(
         model="default",
         messages=conversation,
         temperature=0,
         max_tokens=64,
         stream=True,
     )
     for position, chunk in enumerate(stream):
         delta = chunk.choices[0].delta
         if position == 0:
             assert delta.role == "assistant"
             continue
         if delta.content:
             print(delta.content, end="", flush=True)
     print("=" * 100)
 def test_regex(args):
     """Request a regex-constrained chat completion and parse the reply as JSON.

     The pattern is sent through ``extra_body`` under the ``regex`` key; the
     returned message content is passed to ``json.loads`` and printed.

     Args:
         args: Parsed command-line namespace; only ``args.base_url`` is read.
     """
     # Pattern pieces, joined in order: opening brace, "name" line,
     # "population" line, closing brace.
     pattern_parts = (
         r"""\{\n""",
         r"""   "name": "[\w]+",\n""",
         r"""   "population": [\d]+\n""",
         r"""\}""",
     )
     regex = "".join(pattern_parts)
     conversation = [
         {"role": "system", "content": "You are a helpful AI assistant"},
         {"role": "user", "content": "Introduce the capital of France."},
     ]
     client = openai.Client(api_key="EMPTY", base_url=args.base_url)
     response = client.chat.completions.create(
         model="default",
         messages=conversation,
         temperature=0,
         max_tokens=128,
         extra_body={"regex": regex},
     )
     reply = response.choices[0].message.content
     print(json.loads(reply))
     print("=" * 100)
 if __name__ == "__main__":
     # Command-line driver: parse the options, then run every check in order.
     parser = argparse.ArgumentParser()
     parser.add_argument("--base-url", type=str, default="http://127.0.0.1:30000/v1")
     parser.add_argument(
         "--test-image", action="store_true", help="Enables testing image inputs"
     )
     args = parser.parse_args()

     # Plain completions first, then streaming completions; within each, every
     # logprobs setting is tried without and then with echo.
     for completion_check in (test_completion, test_completion_stream):
         for logprobs in (False, True, 3):
             for echo in (False, True):
                 completion_check(args, echo=echo, logprobs=logprobs)

     for chat_check in (test_chat_completion, test_chat_completion_stream, test_regex):
         chat_check(args)

     # The image check only runs when --test-image is given.
     if args.test_image:
         test_chat_completion_image(args)
--- a/test/lang/test_bind_cache.py
+++ b/test/lang/test_bind_cache.py
@@ -1,7 +1,7 @@
 import unittest
 import sglang as sgl
-from sglang.test.test_utils import MODEL_NAME_FOR_TEST
+from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST
 class TestBind(unittest.TestCase):
@@ -9,7 +9,7 @@ class TestBind(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
-        cls.backend = sgl.Runtime(model_path=MODEL_NAME_FOR_TEST)
+        cls.backend = sgl.Runtime(model_path=DEFAULT_MODEL_NAME_FOR_TEST)
        sgl.set_default_backend(cls.backend)
    @classmethod
--- a/test/lang/test_srt_backend.py
+++ b/test/lang/test_srt_backend.py
@@ -14,7 +14,7 @@ from sglang.test.test_programs import (
    test_stream,
    test_tool_use,
 )
-from sglang.test.test_utils import MODEL_NAME_FOR_TEST
+from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST
 class TestSRTBackend(unittest.TestCase):
@@ -22,7 +22,7 @@ class TestSRTBackend(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
-        cls.backend = sgl.Runtime(model_path=MODEL_NAME_FOR_TEST)
+        cls.backend = sgl.Runtime(model_path=DEFAULT_MODEL_NAME_FOR_TEST)
        sgl.set_default_backend(cls.backend)
    @classmethod
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -5,8 +5,9 @@ from sglang.test.test_utils import run_unittest_files
 suites = {
    "minimal": [
        "test_openai_server.py",
        "test_eval_accuracy.py",
        "test_openai_server.py",
        "test_vision_openai_server.py",
        "test_chunked_prefill.py",
        "test_torch_compile.py",
        "models/test_causal_models.py",
--- a/test/srt/test_chunked_prefill.py
+++ b/test/srt/test_chunked_prefill.py
@@ -3,14 +3,14 @@ from types import SimpleNamespace
 from sglang.srt.utils import kill_child_process
 from sglang.test.run_eval import run_eval
-from sglang.test.test_utils import MODEL_NAME_FOR_TEST, popen_launch_server
+from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, popen_launch_server
 class TestAccuracy(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
-        cls.model = MODEL_NAME_FOR_TEST
+        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
        cls.base_url = f"http://localhost:8157"
        cls.process = popen_launch_server(
            cls.model,
--- a/test/srt/test_eval_accuracy.py
+++ b/test/srt/test_eval_accuracy.py
@@ -3,14 +3,14 @@ from types import SimpleNamespace
 from sglang.srt.utils import kill_child_process
 from sglang.test.run_eval import run_eval
-from sglang.test.test_utils import MODEL_NAME_FOR_TEST, popen_launch_server
+from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, popen_launch_server
 class TestAccuracy(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
-        cls.model = MODEL_NAME_FOR_TEST
+        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
        cls.base_url = f"http://localhost:8157"
        cls.process = popen_launch_server(cls.model, cls.base_url, timeout=300)
--- a/test/srt/test_openai_server.py
+++ b/test/srt/test_openai_server.py
@@ -5,21 +5,21 @@ import openai
 from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.utils import kill_child_process
-from sglang.test.test_utils import MODEL_NAME_FOR_TEST, popen_launch_server
+from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, popen_launch_server
 class TestOpenAIServer(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
-        cls.model = MODEL_NAME_FOR_TEST
+        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
        cls.base_url = f"http://localhost:8157"
        cls.api_key = "sk-123456"
        cls.process = popen_launch_server(
            cls.model, cls.base_url, timeout=300, api_key=cls.api_key
        )
        cls.base_url += "/v1"
-        cls.tokenizer = get_tokenizer(MODEL_NAME_FOR_TEST)
+        cls.tokenizer = get_tokenizer(DEFAULT_MODEL_NAME_FOR_TEST)
    @classmethod
    def tearDownClass(cls):
@@ -147,6 +147,7 @@ class TestOpenAIServer(unittest.TestCase):
            top_logprobs=logprobs,
            n=parallel_sample_num,
        )
        if logprobs:
            assert isinstance(
                response.choices[0].logprobs.content[0].top_logprobs[0].token, str
@@ -158,6 +159,7 @@ class TestOpenAIServer(unittest.TestCase):
            assert (
                ret_num_top_logprobs == logprobs
            ), f"{ret_num_top_logprobs} vs {logprobs}"
        assert len(response.choices) == parallel_sample_num
        assert response.choices[0].message.role == "assistant"
        assert isinstance(response.choices[0].message.content, str)
--- a/test/srt/test_srt_endpoint.py
+++ b/test/srt/test_srt_endpoint.py
@@ -5,14 +5,14 @@ import requests
 from sglang.srt.utils import kill_child_process
 from sglang.test.run_eval import run_eval
-from sglang.test.test_utils import MODEL_NAME_FOR_TEST, popen_launch_server
+from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, popen_launch_server
 class TestSRTEndpoint(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
-        cls.model = MODEL_NAME_FOR_TEST
+        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
        cls.base_url = f"http://localhost:{8157}"
        cls.process = popen_launch_server(cls.model, cls.base_url, timeout=300)
--- a/test/srt/test_torch_compile.py
+++ b/test/srt/test_torch_compile.py
@@ -3,14 +3,14 @@ from types import SimpleNamespace
 from sglang.srt.utils import kill_child_process
 from sglang.test.run_eval import run_eval
-from sglang.test.test_utils import MODEL_NAME_FOR_TEST, popen_launch_server
+from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, popen_launch_server
 class TestAccuracy(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
-        cls.model = MODEL_NAME_FOR_TEST
+        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
        cls.base_url = f"http://localhost:8157"
        cls.process = popen_launch_server(
            cls.model, cls.base_url, timeout=300, other_args=["--enable-torch-compile"]
--- a/test/srt/test_vision_openai_server.py
+++ b/test/srt/test_vision_openai_server.py
@@ -0,0 +1,75 @@
 import json
 import unittest
 import openai
 from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.utils import kill_child_process
 from sglang.test.test_utils import popen_launch_server
 class TestOpenAIVisionServer(unittest.TestCase):
     """Chat-completion check with image input against a launched server."""

     @classmethod
     def setUpClass(cls):
         """Launch the server once for the class and record its /v1 base URL."""
         cls.model = "liuhaotian/llava-v1.6-vicuna-7b"
         cls.base_url = "http://localhost:8157"
         cls.api_key = "sk-123456"
         launch_flags = [
             "--chat-template",
             "vicuna_v1.1",
             "--tokenizer-path",
             "llava-hf/llava-1.5-7b-hf",
             "--log-requests",
         ]
         cls.process = popen_launch_server(
             cls.model,
             cls.base_url,
             timeout=300,
             api_key=cls.api_key,
             other_args=launch_flags,
         )
         # The server is launched with the bare URL; the client uses the
         # "/v1" prefix.
         cls.base_url = cls.base_url + "/v1"

     @classmethod
     def tearDownClass(cls):
         """Stop the server process started in ``setUpClass``."""
         kill_child_process(cls.process.pid)

     def test_chat_completion(self):
         """Ask for a description of an image and check the response fields."""
         image_part = {
             "type": "image_url",
             "image_url": {
                 "url": "https://github.com/sgl-project/sglang/blob/main/assets/logo.png?raw=true"
             },
         }
         text_part = {"type": "text", "text": "Describe this image"}
         # The image comes before the text in the user turn.
         conversation = [{"role": "user", "content": [image_part, text_part]}]

         client = openai.Client(api_key=self.api_key, base_url=self.base_url)
         response = client.chat.completions.create(
             model="default",
             messages=conversation,
             temperature=0,
             max_tokens=32,
         )

         message = response.choices[0].message
         assert message.role == "assistant"
         assert isinstance(response.choices[0].message.content, str)
         assert response.id
         assert response.created
         usage = response.usage
         assert usage.prompt_tokens > 0
         assert usage.completion_tokens > 0
         assert usage.total_tokens > 0
 # Script entry point: hand control to the unittest runner, passing
 # warnings="ignore" as its warnings setting.
 if __name__ == "__main__":
    unittest.main(warnings="ignore")
    # Disabled manual alternative: drive the single test by hand (set-up,
    # test, tear-down) instead of going through the unittest runner.
    # t = TestOpenAIVisionServer()
    # t.setUpClass()
    # t.test_chat_completion()
    # t.tearDownClass()