def kill_child_process(pid, including_parent=True):
    """Forcefully kill the process tree rooted at ``pid``.

    All descendants of ``pid`` are killed first; ``pid`` itself is killed
    afterwards when ``including_parent`` is true. Processes that have already
    exited are ignored, so the call is safe to repeat.

    Args:
        pid: Id of the root process of the tree to terminate.
        including_parent: Also kill ``pid`` itself (default ``True``).

    Returns:
        None. If ``pid`` does not exist the function returns immediately.
    """
    try:
        parent = psutil.Process(pid)
        # children() raises NoSuchProcess too when the parent exits between
        # the lookup above and the enumeration, so keep it under the guard.
        children = parent.children(recursive=True)
    except psutil.NoSuchProcess:
        return

    # Descendants first, then (optionally) the root, same order as before.
    targets = list(children)
    if including_parent:
        targets.append(parent)

    for proc in targets:
        try:
            proc.kill()
        except psutil.NoSuchProcess:
            # Already gone; nothing to do.
            pass

    # Give the killed processes a bounded amount of time to disappear so the
    # caller does not race with processes that are still shutting down. This
    # restores the 5 second wait the inlined Runtime.shutdown code used.
    psutil.wait_procs(targets, timeout=5)
b/test/lang/test_srt_backend.py index fae53f0b1..f9d79ed29 100644 --- a/test/lang/test_srt_backend.py +++ b/test/lang/test_srt_backend.py @@ -21,7 +21,7 @@ class TestSRTBackend(unittest.TestCase): @classmethod def setUpClass(cls): - cls.backend = sgl.Runtime(model_path="meta-llama/Meta-Llama-3-8B-Instruct") + cls.backend = sgl.Runtime(model_path="meta-llama/Meta-Llama-3.1-8B-Instruct") sgl.set_default_backend(cls.backend) @classmethod diff --git a/test/srt/example_image.png b/test/srt/example_image.png deleted file mode 120000 index c8a970edd..000000000 --- a/test/srt/example_image.png +++ /dev/null @@ -1 +0,0 @@ -../lang/example_image.png \ No newline at end of file diff --git a/test/srt/test_curl.sh b/test/srt/old/test_curl.sh similarity index 100% rename from test/srt/test_curl.sh rename to test/srt/old/test_curl.sh diff --git a/test/srt/test_flashinfer.py b/test/srt/old/test_flashinfer.py similarity index 100% rename from test/srt/test_flashinfer.py rename to test/srt/old/test_flashinfer.py diff --git a/test/srt/test_httpserver_classify.py b/test/srt/old/test_httpserver_classify.py similarity index 100% rename from test/srt/test_httpserver_classify.py rename to test/srt/old/test_httpserver_classify.py diff --git a/test/srt/test_httpserver_concurrent.py b/test/srt/old/test_httpserver_concurrent.py similarity index 100% rename from test/srt/test_httpserver_concurrent.py rename to test/srt/old/test_httpserver_concurrent.py diff --git a/test/srt/test_httpserver_decode.py b/test/srt/old/test_httpserver_decode.py similarity index 100% rename from test/srt/test_httpserver_decode.py rename to test/srt/old/test_httpserver_decode.py diff --git a/test/srt/test_httpserver_decode_stream.py b/test/srt/old/test_httpserver_decode_stream.py similarity index 100% rename from test/srt/test_httpserver_decode_stream.py rename to test/srt/old/test_httpserver_decode_stream.py diff --git a/test/srt/test_httpserver_llava.py b/test/srt/old/test_httpserver_llava.py similarity index 100% 
"""
First run the following command to launch the server.
Note that TinyLlama adopts different chat templates in different versions.
For v0.4, the chat template is chatml.

python3 -m sglang.launch_server --model-path TinyLlama/TinyLlama-1.1B-Chat-v0.4 \
--port 30000 --chat-template chatml

Output example:
The capital of France is Paris.
The capital of the United States is Washington, D.C.
The capital of Canada is Ottawa.
The capital of Japan is Tokyo
"""

import argparse
import json

import openai


def test_completion(args, echo, logprobs):
    """Send one non-streaming completion request and sanity-check the reply.

    Args:
        args: Parsed CLI namespace; only ``args.base_url`` is read.
        echo: Forwarded to the API; when truthy the returned text must start
            with the prompt.
        logprobs: Forwarded to the API; when truthy the logprobs payload is
            checked as well.
    """
    client = openai.Client(api_key="EMPTY", base_url=args.base_url)
    response = client.completions.create(
        model="default",
        prompt="The capital of France is",
        temperature=0,
        max_tokens=32,
        echo=echo,
        logprobs=logprobs,
    )
    text = response.choices[0].text
    print(response.choices[0].text)
    if echo:
        assert text.startswith("The capital of France is")
    if logprobs:
        print(response.choices[0].logprobs.top_logprobs)
        assert response.choices[0].logprobs
        # Identity comparison: ``== None`` can be overridden by __eq__.
        if echo:
            assert response.choices[0].logprobs.token_logprobs[0] is None
        else:
            assert response.choices[0].logprobs.token_logprobs[0] is not None
    assert response.id
    assert response.created
    assert response.usage.prompt_tokens > 0
    assert response.usage.completion_tokens > 0
    assert response.usage.total_tokens > 0
    print("=" * 100)


def test_completion_stream(args, echo, logprobs):
    """Send one streaming completion request and check every chunk.

    Args:
        args: Parsed CLI namespace; only ``args.base_url`` is read.
        echo: Forwarded to the API; when truthy the first chunk must start
            with the prompt.
        logprobs: Forwarded to the API; when truthy the per-chunk logprobs
            are printed instead of the raw text.
    """
    client = openai.Client(api_key="EMPTY", base_url=args.base_url)
    response = client.completions.create(
        model="default",
        prompt="The capital of France is",
        temperature=0,
        max_tokens=32,
        stream=True,
        echo=echo,
        logprobs=logprobs,
    )
    first = True
    for r in response:
        if first:
            if echo:
                assert r.choices[0].text.startswith("The capital of France is")
            first = False
        if logprobs:
            print(
                f"{r.choices[0].text:12s}\t" f"{r.choices[0].logprobs.token_logprobs}",
                flush=True,
            )
            print(r.choices[0].logprobs.top_logprobs)
        else:
            print(r.choices[0].text, end="", flush=True)
        assert r.id
        assert r.usage.prompt_tokens > 0
        assert r.usage.completion_tokens > 0
        assert r.usage.total_tokens > 0
    print("=" * 100)


def test_chat_completion(args):
    """Send one non-streaming chat request and check id, timestamp and usage."""
    client = openai.Client(api_key="EMPTY", base_url=args.base_url)
    response = client.chat.completions.create(
        model="default",
        messages=[
            {"role": "system", "content": "You are a helpful AI assistant"},
            {"role": "user", "content": "What is the capital of France?"},
        ],
        temperature=0,
        max_tokens=32,
    )
    print(response.choices[0].message.content)
    assert response.id
    assert response.created
    assert response.usage.prompt_tokens > 0
    assert response.usage.completion_tokens > 0
    assert response.usage.total_tokens > 0
    print("=" * 100)


def test_chat_completion_image(args):
    """Send a chat request with a text part and an image URL part."""
    client = openai.Client(api_key="EMPTY", base_url=args.base_url)
    response = client.chat.completions.create(
        model="default",
        messages=[
            {"role": "system", "content": "You are a helpful AI assistant"},
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Describe this image"},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "https://raw.githubusercontent.com/sgl-project/sglang/main/assets/mixtral_8x7b.jpg"
                        },
                    },
                ],
            },
        ],
        temperature=0,
        max_tokens=32,
    )
    print(response.choices[0].message.content)
    assert response.id
    assert response.created
    assert response.usage.prompt_tokens > 0
    assert response.usage.completion_tokens > 0
    assert response.usage.total_tokens > 0
    print("=" * 100)


def test_chat_completion_stream(args):
    """Stream a chat completion; the first chunk must carry the assistant role."""
    client = openai.Client(api_key="EMPTY", base_url=args.base_url)
    response = client.chat.completions.create(
        model="default",
        messages=[
            {"role": "system", "content": "You are a helpful AI assistant"},
            {"role": "user", "content": "List 3 countries and their capitals."},
        ],
        temperature=0,
        max_tokens=64,
        stream=True,
    )
    is_first = True
    for chunk in response:
        if is_first:
            is_first = False
            assert chunk.choices[0].delta.role == "assistant"
            continue

        data = chunk.choices[0].delta
        if not data.content:
            # Skip chunks without text content.
            continue
        print(data.content, end="", flush=True)
    print("=" * 100)


def test_regex(args):
    """Request regex-constrained output and verify that it parses as JSON."""
    client = openai.Client(api_key="EMPTY", base_url=args.base_url)

    regex = (
        r"""\{\n"""
        + r""" "name": "[\w]+",\n"""
        + r""" "population": [\d]+\n"""
        + r"""\}"""
    )

    response = client.chat.completions.create(
        model="default",
        messages=[
            {"role": "system", "content": "You are a helpful AI assistant"},
            {"role": "user", "content": "Introduce the capital of France."},
        ],
        temperature=0,
        max_tokens=128,
        extra_body={"regex": regex},
    )
    text = response.choices[0].message.content
    # json.loads raises if the constrained output is not valid JSON.
    print(json.loads(text))
    print("=" * 100)


def main():
    """Parse the CLI flags and run every check against the target server."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--base-url", type=str, default="http://127.0.0.1:30000/v1")
    parser.add_argument(
        "--test-image", action="store_true", help="Enables testing image inputs"
    )
    args = parser.parse_args()

    test_completion(args, echo=False, logprobs=False)
    test_completion(args, echo=True, logprobs=False)
    test_completion(args, echo=False, logprobs=True)
    test_completion(args, echo=True, logprobs=True)
    test_completion(args, echo=False, logprobs=3)
    test_completion(args, echo=True, logprobs=3)
    test_completion_stream(args, echo=False, logprobs=False)
    test_completion_stream(args, echo=True, logprobs=False)
    test_completion_stream(args, echo=False, logprobs=True)
    test_completion_stream(args, echo=True, logprobs=True)
    test_completion_stream(args, echo=False, logprobs=3)
    test_completion_stream(args, echo=True, logprobs=3)
    test_chat_completion(args)
    test_chat_completion_stream(args)
    test_regex(args)
    if args.test_image:
        test_chat_completion_image(args)


if __name__ == "__main__":
    main()
import subprocess
import time
import unittest

import openai
import requests

from sglang.srt.utils import kill_child_process


class TestOpenAIServer(unittest.TestCase):
    """Tests for the OpenAI-compatible completions endpoint.

    ``setUpClass`` launches ``sglang.launch_server`` as a subprocess and
    polls ``/v1/models`` until it answers with HTTP 200; ``tearDownClass``
    kills the server's process tree.
    """

    @classmethod
    def setUpClass(cls):
        """Start the server and block until it is ready.

        Raises:
            RuntimeError: The server process exited before becoming ready.
            TimeoutError: The server did not answer within the timeout.
        """
        model = "meta-llama/Meta-Llama-3.1-8B-Instruct"
        port = 30000
        timeout = 300

        command = [
            "python3", "-m", "sglang.launch_server",
            "--model-path", model,
            "--host", "localhost",
            "--port", str(port),
        ]
        cls.process = subprocess.Popen(command, stdout=None, stderr=None)
        cls.base_url = f"http://localhost:{port}/v1"
        cls.model = model

        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            # A dead server will never become ready; fail now instead of
            # sleeping until the deadline.
            exit_code = cls.process.poll()
            if exit_code is not None:
                raise RuntimeError(
                    f"Server process exited early with code {exit_code}."
                )
            try:
                # Bound each probe so a hung connection cannot outlive the
                # overall startup timeout.
                response = requests.get(f"{cls.base_url}/models", timeout=10)
                if response.status_code == 200:
                    return
            except requests.RequestException:
                # Not accepting connections yet; retry after the sleep.
                pass
            time.sleep(10)

        # unittest skips tearDownClass when setUpClass raises, so the server
        # has to be stopped here or it would be leaked.
        cls._stop_server()
        raise TimeoutError("Server failed to start within the timeout period.")

    @classmethod
    def tearDownClass(cls):
        """Stop the server started by ``setUpClass``."""
        cls._stop_server()

    @classmethod
    def _stop_server(cls):
        """Kill the server's process tree and reap the direct child."""
        kill_child_process(cls.process.pid)
        # Collect the exit status so the killed child is not left as a zombie.
        cls.process.wait(timeout=30)

    def _check_usage(self, response):
        """Assert that id, creation time and token usage are populated."""
        self.assertTrue(response.id)
        self.assertTrue(response.created)
        self.assertGreater(response.usage.prompt_tokens, 0)
        self.assertGreater(response.usage.completion_tokens, 0)
        self.assertGreater(response.usage.total_tokens, 0)

    def run_completion(self, echo, logprobs):
        """Issue one non-streaming completion and validate the response.

        Args:
            echo: Forwarded to the API; when truthy the text must start with
                the prompt and the first token logprob must be ``None``.
            logprobs: Forwarded to the API; when truthy it is also the
                expected size of the ``top_logprobs`` entry that is checked.
        """
        client = openai.Client(api_key="EMPTY", base_url=self.base_url)
        prompt = "The capital of France is"
        response = client.completions.create(
            model=self.model,
            prompt=prompt,
            temperature=0.1,
            max_tokens=32,
            echo=echo,
            logprobs=logprobs,
        )
        choice = response.choices[0]

        if echo:
            self.assertTrue(choice.text.startswith(prompt))
        if logprobs:
            self.assertTrue(choice.logprobs)
            self.assertIsInstance(choice.logprobs.tokens[0], str)
            self.assertIsInstance(choice.logprobs.top_logprobs[1], dict)
            self.assertEqual(len(choice.logprobs.top_logprobs[1]), logprobs)
            if echo:
                self.assertIsNone(choice.logprobs.token_logprobs[0])
            else:
                self.assertIsNotNone(choice.logprobs.token_logprobs[0])
        self._check_usage(response)

    def run_completion_stream(self, echo, logprobs):
        """Issue one streaming completion and validate every chunk.

        Args:
            echo: Forwarded to the API; when truthy the first chunk's text
                must start with the prompt.
            logprobs: Forwarded to the API; when truthy the logprobs payload
                of each chunk is checked.
        """
        client = openai.Client(api_key="EMPTY", base_url=self.base_url)
        prompt = "The capital of France is"
        generator = client.completions.create(
            model=self.model,
            prompt=prompt,
            temperature=0.1,
            max_tokens=32,
            echo=echo,
            logprobs=logprobs,
            stream=True,
        )

        first = True
        for response in generator:
            choice = response.choices[0]
            if logprobs:
                self.assertTrue(choice.logprobs)
                self.assertIsInstance(choice.logprobs.tokens[0], str)
                # The first chunk of an echoed stream is exempt from the
                # top_logprobs type check.
                if not (first and echo):
                    self.assertIsInstance(choice.logprobs.top_logprobs[0], dict)
                    # NOTE(review): the original carried a disabled check that
                    # len(top_logprobs[0]) == logprobs; it is left disabled --
                    # confirm whether the server is expected to satisfy it.

            if first:
                if echo:
                    self.assertTrue(choice.text.startswith(prompt))
                first = False

            self._check_usage(response)

    def test_completion(self):
        """Run the non-streaming check over every echo/logprobs combination."""
        for echo in [False, True]:
            for logprobs in [None, 5]:
                with self.subTest(echo=echo, logprobs=logprobs):
                    self.run_completion(echo, logprobs)

    def test_completion_stream(self):
        """Run the streaming check with echo enabled and logprobs=5."""
        for echo in [True]:
            for logprobs in [5]:
                with self.subTest(echo=echo, logprobs=logprobs):
                    self.run_completion_stream(echo, logprobs)


if __name__ == "__main__":
    # NOTE(review): the original had ``unittest.main(warnings="ignore")``
    # commented out and ran only the streaming test by hand; that selection is
    # kept as-is -- confirm whether the full suite should run here.
    t = TestOpenAIServer()
    t.setUpClass()
    try:
        t.test_completion_stream()
    finally:
        # Always stop the server, even when the test raises.
        t.tearDownClass()