From 3bc99e6fe4d77979f0e0de707a59a9cf305504d6 Mon Sep 17 00:00:00 2001
From: Ying Sheng
Date: Sun, 4 Aug 2024 20:51:55 -0700
Subject: [PATCH] Test openai vision api (#925)

---
 README.md                                    |   2 +-
 .../sglang/srt/managers/tokenizer_manager.py |   7 +-
 python/sglang/test/test_utils.py             |   2 +-
 scripts/deprecated/test_openai_server.py     | 209 ------------------
 test/lang/test_bind_cache.py                 |   4 +-
 test/lang/test_srt_backend.py                |   4 +-
 test/srt/run_suite.py                        |   3 +-
 test/srt/test_chunked_prefill.py             |   4 +-
 test/srt/test_eval_accuracy.py               |   4 +-
 test/srt/test_openai_server.py               |   8 +-
 test/srt/test_srt_endpoint.py                |   4 +-
 test/srt/test_torch_compile.py               |   4 +-
 test/srt/test_vision_openai_server.py        |  75 +++++++
 13 files changed, 102 insertions(+), 228 deletions(-)
 delete mode 100644 scripts/deprecated/test_openai_server.py
 create mode 100644 test/srt/test_vision_openai_server.py

diff --git a/README.md b/README.md
index 1cd23a010..f4bff4cad 100644
--- a/README.md
+++ b/README.md
@@ -136,7 +136,7 @@ response = client.chat.completions.create(
 print(response)
 ```
 
-It supports streaming, vision, and most features of the Chat/Completions/Models endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
+It supports streaming, vision, and most features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
 
 ### Additional Server Arguments
 - Add `--tp 2` to enable tensor parallelism. If it indicates `peer access is not supported between these two devices`, add `--enable-p2p-check` option.
diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py
index e44122bf1..1f45eed1f 100644
--- a/python/sglang/srt/managers/tokenizer_manager.py
+++ b/python/sglang/srt/managers/tokenizer_manager.py
@@ -390,8 +390,13 @@ class TokenizerManager:
                 obj.return_text_in_logprobs,
             )
 
+            # Log requests
             if self.server_args.log_requests and state.finished:
-                logger.info(f"in={obj.text}, out={out}")
+                if obj.text is None:
+                    in_obj = {"text": self.tokenizer.decode(obj.input_ids)}
+                else:
+                    in_obj = {"text": obj.text}
+                logger.info(f"in={in_obj}, out={out}")
 
             state.out_list = []
             if state.finished:
diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py
index be1bdb966..43f2730c7 100644
--- a/python/sglang/test/test_utils.py
+++ b/python/sglang/test/test_utils.py
@@ -18,7 +18,7 @@ from sglang.lang.backend.openai import OpenAI
 from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
 from sglang.utils import get_exception_traceback
 
-MODEL_NAME_FOR_TEST = "meta-llama/Meta-Llama-3.1-8B-Instruct"
+DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Meta-Llama-3.1-8B-Instruct"
 
 
 def call_generate_lightllm(prompt, temperature, max_tokens, stop=None, url=None):
diff --git a/scripts/deprecated/test_openai_server.py b/scripts/deprecated/test_openai_server.py
deleted file mode 100644
index a77319b1b..000000000
--- a/scripts/deprecated/test_openai_server.py
+++ /dev/null
@@ -1,209 +0,0 @@
-"""
-First run the following command to launch the server.
-Note that TinyLlama adopts different chat templates in different versions.
-For v0.4, the chat template is chatml.
-
-python3 -m sglang.launch_server --model-path TinyLlama/TinyLlama-1.1B-Chat-v0.4 \
---port 30000 --chat-template chatml
-
-Output example:
-The capital of France is Paris.
-The capital of the United States is Washington, D.C.
-The capital of Canada is Ottawa.
-The capital of Japan is Tokyo
-"""
-
-import argparse
-import json
-
-import openai
-
-
-def test_completion(args, echo, logprobs):
-    client = openai.Client(api_key="EMPTY", base_url=args.base_url)
-    response = client.completions.create(
-        model="default",
-        prompt="The capital of France is",
-        temperature=0,
-        max_tokens=32,
-        echo=echo,
-        logprobs=logprobs,
-    )
-    text = response.choices[0].text
-    print(response.choices[0].text)
-    if echo:
-        assert text.startswith("The capital of France is")
-    if logprobs:
-        print(response.choices[0].logprobs.top_logprobs)
-        assert response.choices[0].logprobs
-        if echo:
-            assert response.choices[0].logprobs.token_logprobs[0] == None
-        else:
-            assert response.choices[0].logprobs.token_logprobs[0] != None
-    assert response.id
-    assert response.created
-    assert response.usage.prompt_tokens > 0
-    assert response.usage.completion_tokens > 0
-    assert response.usage.total_tokens > 0
-    print("=" * 100)
-
-
-def test_completion_stream(args, echo, logprobs):
-    client = openai.Client(api_key="EMPTY", base_url=args.base_url)
-    response = client.completions.create(
-        model="default",
-        prompt="The capital of France is",
-        temperature=0,
-        max_tokens=32,
-        stream=True,
-        echo=echo,
-        logprobs=logprobs,
-    )
-    first = True
-    for r in response:
-        if first:
-            if echo:
-                assert r.choices[0].text.startswith("The capital of France is")
-            first = False
-        if logprobs:
-            print(
-                f"{r.choices[0].text:12s}\t" f"{r.choices[0].logprobs.token_logprobs}",
-                flush=True,
-            )
-            print(r.choices[0].logprobs.top_logprobs)
-        else:
-            print(r.choices[0].text, end="", flush=True)
-        assert r.id
-        assert r.usage.prompt_tokens > 0
-        assert r.usage.completion_tokens > 0
-        assert r.usage.total_tokens > 0
-    print("=" * 100)
-
-
-def test_chat_completion(args):
-    client = openai.Client(api_key="EMPTY", base_url=args.base_url)
-    response = client.chat.completions.create(
-        model="default",
-        messages=[
-            {"role": "system", "content": "You are a helpful AI assistant"},
-            {"role": "user", "content": "What is the capital of France?"},
-        ],
-        temperature=0,
-        max_tokens=32,
-    )
-    print(response.choices[0].message.content)
-    assert response.id
-    assert response.created
-    assert response.usage.prompt_tokens > 0
-    assert response.usage.completion_tokens > 0
-    assert response.usage.total_tokens > 0
-    print("=" * 100)
-
-
-def test_chat_completion_image(args):
-    client = openai.Client(api_key="EMPTY", base_url=args.base_url)
-    response = client.chat.completions.create(
-        model="default",
-        messages=[
-            {"role": "system", "content": "You are a helpful AI assistant"},
-            {
-                "role": "user",
-                "content": [
-                    {"type": "text", "text": "Describe this image"},
-                    {
-                        "type": "image_url",
-                        "image_url": {
-                            "url": "https://raw.githubusercontent.com/sgl-project/sglang/main/assets/mixtral_8x7b.jpg"
-                        },
-                    },
-                ],
-            },
-        ],
-        temperature=0,
-        max_tokens=32,
-    )
-    print(response.choices[0].message.content)
-    assert response.id
-    assert response.created
-    assert response.usage.prompt_tokens > 0
-    assert response.usage.completion_tokens > 0
-    assert response.usage.total_tokens > 0
-    print("=" * 100)
-
-
-def test_chat_completion_stream(args):
-    client = openai.Client(api_key="EMPTY", base_url=args.base_url)
-    response = client.chat.completions.create(
-        model="default",
-        messages=[
-            {"role": "system", "content": "You are a helpful AI assistant"},
-            {"role": "user", "content": "List 3 countries and their capitals."},
-        ],
-        temperature=0,
-        max_tokens=64,
-        stream=True,
-    )
-    is_first = True
-    for chunk in response:
-        if is_first:
-            is_first = False
-            assert chunk.choices[0].delta.role == "assistant"
-            continue
-
-        data = chunk.choices[0].delta
-        if not data.content:
-            continue
-        print(data.content, end="", flush=True)
-    print("=" * 100)
-
-
-def test_regex(args):
-    client = openai.Client(api_key="EMPTY", base_url=args.base_url)
-
-    regex = (
-        r"""\{\n"""
-        + r"""    "name": "[\w]+",\n"""
-        + r"""    "population": [\d]+\n"""
-        + r"""\}"""
-    )
-
-    response = client.chat.completions.create(
-        model="default",
-        messages=[
-            {"role": "system", "content": "You are a helpful AI assistant"},
-            {"role": "user", "content": "Introduce the capital of France."},
-        ],
-        temperature=0,
-        max_tokens=128,
-        extra_body={"regex": regex},
-    )
-    text = response.choices[0].message.content
-    print(json.loads(text))
-    print("=" * 100)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--base-url", type=str, default="http://127.0.0.1:30000/v1")
-    parser.add_argument(
-        "--test-image", action="store_true", help="Enables testing image inputs"
-    )
-    args = parser.parse_args()
-
-    test_completion(args, echo=False, logprobs=False)
-    test_completion(args, echo=True, logprobs=False)
-    test_completion(args, echo=False, logprobs=True)
-    test_completion(args, echo=True, logprobs=True)
-    test_completion(args, echo=False, logprobs=3)
-    test_completion(args, echo=True, logprobs=3)
-    test_completion_stream(args, echo=False, logprobs=False)
-    test_completion_stream(args, echo=True, logprobs=False)
-    test_completion_stream(args, echo=False, logprobs=True)
-    test_completion_stream(args, echo=True, logprobs=True)
-    test_completion_stream(args, echo=False, logprobs=3)
-    test_completion_stream(args, echo=True, logprobs=3)
-    test_chat_completion(args)
-    test_chat_completion_stream(args)
-    test_regex(args)
-    if args.test_image:
-        test_chat_completion_image(args)
diff --git a/test/lang/test_bind_cache.py b/test/lang/test_bind_cache.py
index 53e1b9754..14a7e5098 100644
--- a/test/lang/test_bind_cache.py
+++ b/test/lang/test_bind_cache.py
@@ -1,7 +1,7 @@
 import unittest
 
 import sglang as sgl
-from sglang.test.test_utils import MODEL_NAME_FOR_TEST
+from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST
 
 
 class TestBind(unittest.TestCase):
@@ -9,7 +9,7 @@ class TestBind(unittest.TestCase):
 
     @classmethod
     def setUpClass(cls):
-        cls.backend = sgl.Runtime(model_path=MODEL_NAME_FOR_TEST)
+        cls.backend = sgl.Runtime(model_path=DEFAULT_MODEL_NAME_FOR_TEST)
         sgl.set_default_backend(cls.backend)
 
     @classmethod
diff --git a/test/lang/test_srt_backend.py b/test/lang/test_srt_backend.py
index 7accd349f..778cde8be 100644
--- a/test/lang/test_srt_backend.py
+++ b/test/lang/test_srt_backend.py
@@ -14,7 +14,7 @@ from sglang.test.test_programs import (
     test_stream,
     test_tool_use,
 )
-from sglang.test.test_utils import MODEL_NAME_FOR_TEST
+from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST
 
 
 class TestSRTBackend(unittest.TestCase):
@@ -22,7 +22,7 @@ class TestSRTBackend(unittest.TestCase):
 
     @classmethod
     def setUpClass(cls):
-        cls.backend = sgl.Runtime(model_path=MODEL_NAME_FOR_TEST)
+        cls.backend = sgl.Runtime(model_path=DEFAULT_MODEL_NAME_FOR_TEST)
         sgl.set_default_backend(cls.backend)
 
     @classmethod
diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py
index ab9ae0f41..38af8aabd 100644
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -5,8 +5,9 @@ from sglang.test.test_utils import run_unittest_files
 
 suites = {
     "minimal": [
-        "test_openai_server.py",
         "test_eval_accuracy.py",
+        "test_openai_server.py",
+        "test_vision_openai_server.py",
"test_vision_openai_server.py", "test_chunked_prefill.py", "test_torch_compile.py", "models/test_causal_models.py", diff --git a/test/srt/test_chunked_prefill.py b/test/srt/test_chunked_prefill.py index e98c713e8..797c3e5cc 100644 --- a/test/srt/test_chunked_prefill.py +++ b/test/srt/test_chunked_prefill.py @@ -3,14 +3,14 @@ from types import SimpleNamespace from sglang.srt.utils import kill_child_process from sglang.test.run_eval import run_eval -from sglang.test.test_utils import MODEL_NAME_FOR_TEST, popen_launch_server +from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, popen_launch_server class TestAccuracy(unittest.TestCase): @classmethod def setUpClass(cls): - cls.model = MODEL_NAME_FOR_TEST + cls.model = DEFAULT_MODEL_NAME_FOR_TEST cls.base_url = f"http://localhost:8157" cls.process = popen_launch_server( cls.model, diff --git a/test/srt/test_eval_accuracy.py b/test/srt/test_eval_accuracy.py index a6911785e..0f7ef20b0 100644 --- a/test/srt/test_eval_accuracy.py +++ b/test/srt/test_eval_accuracy.py @@ -3,14 +3,14 @@ from types import SimpleNamespace from sglang.srt.utils import kill_child_process from sglang.test.run_eval import run_eval -from sglang.test.test_utils import MODEL_NAME_FOR_TEST, popen_launch_server +from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, popen_launch_server class TestAccuracy(unittest.TestCase): @classmethod def setUpClass(cls): - cls.model = MODEL_NAME_FOR_TEST + cls.model = DEFAULT_MODEL_NAME_FOR_TEST cls.base_url = f"http://localhost:8157" cls.process = popen_launch_server(cls.model, cls.base_url, timeout=300) diff --git a/test/srt/test_openai_server.py b/test/srt/test_openai_server.py index 45648ce1d..28b64bd96 100644 --- a/test/srt/test_openai_server.py +++ b/test/srt/test_openai_server.py @@ -5,21 +5,21 @@ import openai from sglang.srt.hf_transformers_utils import get_tokenizer from sglang.srt.utils import kill_child_process -from sglang.test.test_utils import MODEL_NAME_FOR_TEST, popen_launch_server +from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, popen_launch_server class TestOpenAIServer(unittest.TestCase): @classmethod def setUpClass(cls): - cls.model = MODEL_NAME_FOR_TEST + cls.model = DEFAULT_MODEL_NAME_FOR_TEST cls.base_url = f"http://localhost:8157" cls.api_key = "sk-123456" cls.process = popen_launch_server( cls.model, cls.base_url, timeout=300, api_key=cls.api_key ) cls.base_url += "/v1" - cls.tokenizer = get_tokenizer(MODEL_NAME_FOR_TEST) + cls.tokenizer = get_tokenizer(DEFAULT_MODEL_NAME_FOR_TEST) @classmethod def tearDownClass(cls): @@ -147,6 +147,7 @@ class TestOpenAIServer(unittest.TestCase): top_logprobs=logprobs, n=parallel_sample_num, ) + if logprobs: assert isinstance( response.choices[0].logprobs.content[0].top_logprobs[0].token, str @@ -158,6 +159,7 @@ class TestOpenAIServer(unittest.TestCase): assert ( ret_num_top_logprobs == logprobs ), f"{ret_num_top_logprobs} vs {logprobs}" + assert len(response.choices) == parallel_sample_num assert response.choices[0].message.role == "assistant" assert isinstance(response.choices[0].message.content, str) diff --git a/test/srt/test_srt_endpoint.py b/test/srt/test_srt_endpoint.py index 76637b2f6..c8db402d8 100644 --- a/test/srt/test_srt_endpoint.py +++ b/test/srt/test_srt_endpoint.py @@ -5,14 +5,14 @@ import requests from sglang.srt.utils import kill_child_process from sglang.test.run_eval import run_eval -from sglang.test.test_utils import MODEL_NAME_FOR_TEST, popen_launch_server +from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, 
 
 
 class TestSRTEndpoint(unittest.TestCase):
 
     @classmethod
     def setUpClass(cls):
-        cls.model = MODEL_NAME_FOR_TEST
+        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
         cls.base_url = f"http://localhost:{8157}"
         cls.process = popen_launch_server(cls.model, cls.base_url, timeout=300)
diff --git a/test/srt/test_torch_compile.py b/test/srt/test_torch_compile.py
index 126ee91ef..e42be1e8a 100644
--- a/test/srt/test_torch_compile.py
+++ b/test/srt/test_torch_compile.py
@@ -3,14 +3,14 @@ from types import SimpleNamespace
 
 from sglang.srt.utils import kill_child_process
 from sglang.test.run_eval import run_eval
-from sglang.test.test_utils import MODEL_NAME_FOR_TEST, popen_launch_server
+from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, popen_launch_server
 
 
 class TestAccuracy(unittest.TestCase):
 
     @classmethod
     def setUpClass(cls):
-        cls.model = MODEL_NAME_FOR_TEST
+        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
         cls.base_url = f"http://localhost:8157"
         cls.process = popen_launch_server(
             cls.model, cls.base_url, timeout=300, other_args=["--enable-torch-compile"]
diff --git a/test/srt/test_vision_openai_server.py b/test/srt/test_vision_openai_server.py
new file mode 100644
index 000000000..9c0829b90
--- /dev/null
+++ b/test/srt/test_vision_openai_server.py
@@ -0,0 +1,75 @@
+import json
+import unittest
+
+import openai
+
+from sglang.srt.hf_transformers_utils import get_tokenizer
+from sglang.srt.utils import kill_child_process
+from sglang.test.test_utils import popen_launch_server
+
+
+class TestOpenAIVisionServer(unittest.TestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "liuhaotian/llava-v1.6-vicuna-7b"
+        cls.base_url = "http://localhost:8157"
+        cls.api_key = "sk-123456"
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=300,
+            api_key=cls.api_key,
+            other_args=[
+                "--chat-template",
+                "vicuna_v1.1",
+                "--tokenizer-path",
+                "llava-hf/llava-1.5-7b-hf",
+                "--log-requests",
+            ],
+        )
+        cls.base_url += "/v1"
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_child_process(cls.process.pid)
+
+    def test_chat_completion(self):
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+
+        response = client.chat.completions.create(
+            model="default",
+            messages=[
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": "https://github.com/sgl-project/sglang/blob/main/assets/logo.png?raw=true"
+                            },
+                        },
+                        {"type": "text", "text": "Describe this image"},
+                    ],
+                },
+            ],
+            temperature=0,
+            max_tokens=32,
+        )
+
+        assert response.choices[0].message.role == "assistant"
+        assert isinstance(response.choices[0].message.content, str)
+        assert response.id
+        assert response.created
+        assert response.usage.prompt_tokens > 0
+        assert response.usage.completion_tokens > 0
+        assert response.usage.total_tokens > 0
+
+
+if __name__ == "__main__":
+    unittest.main(warnings="ignore")
+
+    # t = TestOpenAIVisionServer()
+    # t.setUpClass()
+    # t.test_chat_completion()
+    # t.tearDownClass()
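
For reference, the vision request shape exercised by the new test can be sent by hand. This is a minimal client-side sketch, assuming a server launched with the same flags as `setUpClass` above (port 8157, API key `sk-123456`, the LLaVA model with `--chat-template vicuna_v1.1`); adjust these values for your deployment:

```python
# Minimal sketch of the OpenAI-compatible vision chat request tested above.
# Assumes: sglang server on localhost:8157, launched with --api-key sk-123456
# and the LLaVA model/chat template used by TestOpenAIVisionServer.setUpClass.
import openai

client = openai.Client(api_key="sk-123456", base_url="http://localhost:8157/v1")

response = client.chat.completions.create(
    model="default",  # sglang serves the launched model under the name "default"
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {
                        "url": "https://github.com/sgl-project/sglang/blob/main/assets/logo.png?raw=true"
                    },
                },
                {"type": "text", "text": "Describe this image"},
            ],
        },
    ],
    temperature=0,
    max_tokens=32,
)
print(response.choices[0].message.content)
```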