Make scripts under /test/srt as unit tests (#875)

2024-08-01 14:34:55 -07:00
parent e4d3333c6c
commit 72b6ea88b4
18 changed files with 353 additions and 212 deletions
--- a/.github/workflows/unit-test.yml
+++ b/.github/workflows/unit-test.yml
@@ -20,8 +20,6 @@ concurrency:
 jobs:
  unit-test:
    runs-on: self-hosted
-    env:
-      CUDA_VISIBLE_DEVICES: 6

    steps:
    - name: Checkout code
@@ -30,6 +28,7 @@ jobs:
    - name: Install dependencies
      run: |
        cd /data/zhyncs/venv && source ./bin/activate && cd -
+
        pip cache purge
        pip install --upgrade pip
        pip install -e "python[all]"
@@ -39,6 +38,14 @@ jobs:
    - name: Test OpenAI Backend
      run: |
        cd /data/zhyncs/venv && source ./bin/activate && cd -
-        cd test/lang
        export OPENAI_API_KEY=${{ secrets.OPENAI_API_KEY }}
+
+        cd test/lang
        python3 test_openai_backend.py
+
+    - name: Test SRT Backend
+      run: |
+        cd /data/zhyncs/venv && source ./bin/activate && cd -
+
+        cd test/lang
+        python3 test_srt_backend.py
--- a/python/sglang/srt/server.py
+++ b/python/sglang/srt/server.py
@@ -73,6 +73,7 @@ from sglang.srt.utils import (
    assert_pkg_version,
    enable_show_time_cost,
    maybe_set_triton_cache_manager,
+    kill_child_process,
    set_ulimit,
 )
 from sglang.utils import get_exception_traceback
@@ -467,16 +468,7 @@ class Runtime:

    def shutdown(self):
        if self.pid is not None:
-            try:
-                parent = psutil.Process(self.pid)
-            except psutil.NoSuchProcess:
-                return
-            children = parent.children(recursive=True)
-            for child in children:
-                child.kill()
-            psutil.wait_procs(children, timeout=5)
-            parent.kill()
-            parent.wait(timeout=5)
+            kill_child_process(self.pid)
            self.pid = None

    def cache_prefix(self, prefix: str):
--- a/python/sglang/srt/utils.py
+++ b/python/sglang/srt/utils.py
@@ -366,6 +366,26 @@ def kill_parent_process():
    os.kill(parent_process.pid, 9)


+def kill_child_process(pid, including_parent=True):
+    try:
+        parent = psutil.Process(pid)
+    except psutil.NoSuchProcess:
+        return
+
+    children = parent.children(recursive=True)
+    for child in children:
+        try:
+            child.kill()
+        except psutil.NoSuchProcess:
+            pass
+
+    if including_parent:
+        try:
+            parent.kill()
+        except psutil.NoSuchProcess:
+            pass
+
+
 def monkey_patch_vllm_p2p_access_check(gpu_id: int):
    """
    Monkey patch the slow p2p access check in vllm.
--- a/python/sglang/test/test_programs.py
+++ b/python/sglang/test/test_programs.py
@@ -105,15 +105,14 @@ def test_decode_json_regex():
    def decode_json(s):
        from sglang.lang.ir import REGEX_FLOAT, REGEX_INT, REGEX_STRING

-        s += "Generate a JSON object to describe the basic information of a city.\n"
+        s += "Generate a JSON object to describe the basic city information of Paris.\n"

        with s.var_scope("json_output"):
            s += "{\n"
            s += '  "name": ' + sgl.gen(regex=REGEX_STRING + ",") + "\n"
            s += '  "population": ' + sgl.gen(regex=REGEX_INT + ",") + "\n"
            s += '  "area": ' + sgl.gen(regex=REGEX_INT + ",") + "\n"
-            s += '  "latitude": ' + sgl.gen(regex=REGEX_FLOAT + ",") + "\n"
-            s += '  "country": ' + sgl.gen(regex=REGEX_STRING) + "\n"
+            s += '  "latitude": ' + sgl.gen(regex=REGEX_FLOAT) + "\n"
            s += "}"

    ret = decode_json.run(temperature=0.0)
@@ -129,7 +128,7 @@ def test_decode_json_regex():
 def test_decode_json():
    @sgl.function
    def decode_json(s):
-        s += "Generate a JSON object to describe the basic information of a city.\n"
+        s += "Generate a JSON object to describe the basic city information of Paris.\n"

        with s.var_scope("json_output"):
            s += "{\n"
@@ -264,6 +263,7 @@ def test_parallel_decoding():
        s += "\nIn summary," + sgl.gen("summary", max_tokens=512)

    ret = parallel_decoding.run(topic="writing a good blog post", temperature=0.3)
+    assert isinstance(ret["summary"], str)


 def test_parallel_encoding(check_answer=True):
--- a/test/lang/test_srt_backend.py
+++ b/test/lang/test_srt_backend.py
@@ -21,7 +21,7 @@ class TestSRTBackend(unittest.TestCase):

    @classmethod
    def setUpClass(cls):
-        cls.backend = sgl.Runtime(model_path="meta-llama/Meta-Llama-3-8B-Instruct")
+        cls.backend = sgl.Runtime(model_path="meta-llama/Meta-Llama-3.1-8B-Instruct")
        sgl.set_default_backend(cls.backend)

    @classmethod
--- a/test/srt/example_image.png
+++ b/test/srt/example_image.png
@@ -1 +0,0 @@
-../lang/example_image.png
--- a/test/srt/old/test_curl.sh
+++ b/test/srt/old/test_curl.sh
--- a/test/srt/old/test_flashinfer.py
+++ b/test/srt/old/test_flashinfer.py
--- a/test/srt/old/test_httpserver_classify.py
+++ b/test/srt/old/test_httpserver_classify.py
--- a/test/srt/old/test_httpserver_concurrent.py
+++ b/test/srt/old/test_httpserver_concurrent.py
--- a/test/srt/old/test_httpserver_decode.py
+++ b/test/srt/old/test_httpserver_decode.py
--- a/test/srt/old/test_httpserver_decode_stream.py
+++ b/test/srt/old/test_httpserver_decode_stream.py
--- a/test/srt/old/test_httpserver_llava.py
+++ b/test/srt/old/test_httpserver_llava.py
--- a/test/srt/old/test_httpserver_reuse.py
+++ b/test/srt/old/test_httpserver_reuse.py
--- a/test/srt/old/test_jump_forward.py
+++ b/test/srt/old/test_jump_forward.py
--- a/test/srt/old/test_openai_server.py
+++ b/test/srt/old/test_openai_server.py
@@ -0,0 +1,209 @@
+"""
+First run the following command to launch the server.
+Note that TinyLlama adopts different chat templates in different versions.
+For v0.4, the chat template is chatml.
+
+python3 -m sglang.launch_server --model-path TinyLlama/TinyLlama-1.1B-Chat-v0.4 \
+--port 30000 --chat-template chatml
+
+Output example:
+The capital of France is Paris.
+The capital of the United States is Washington, D.C.
+The capital of Canada is Ottawa.
+The capital of Japan is Tokyo
+"""
+
+import argparse
+import json
+
+import openai
+
+
+def test_completion(args, echo, logprobs):
+    client = openai.Client(api_key="EMPTY", base_url=args.base_url)
+    response = client.completions.create(
+        model="default",
+        prompt="The capital of France is",
+        temperature=0,
+        max_tokens=32,
+        echo=echo,
+        logprobs=logprobs,
+    )
+    text = response.choices[0].text
+    print(response.choices[0].text)
+    if echo:
+        assert text.startswith("The capital of France is")
+    if logprobs:
+        print(response.choices[0].logprobs.top_logprobs)
+        assert response.choices[0].logprobs
+        if echo:
+            assert response.choices[0].logprobs.token_logprobs[0] == None
+        else:
+            assert response.choices[0].logprobs.token_logprobs[0] != None
+    assert response.id
+    assert response.created
+    assert response.usage.prompt_tokens > 0
+    assert response.usage.completion_tokens > 0
+    assert response.usage.total_tokens > 0
+    print("=" * 100)
+
+
+def test_completion_stream(args, echo, logprobs):
+    client = openai.Client(api_key="EMPTY", base_url=args.base_url)
+    response = client.completions.create(
+        model="default",
+        prompt="The capital of France is",
+        temperature=0,
+        max_tokens=32,
+        stream=True,
+        echo=echo,
+        logprobs=logprobs,
+    )
+    first = True
+    for r in response:
+        if first:
+            if echo:
+                assert r.choices[0].text.startswith("The capital of France is")
+            first = False
+        if logprobs:
+            print(
+                f"{r.choices[0].text:12s}\t" f"{r.choices[0].logprobs.token_logprobs}",
+                flush=True,
+            )
+            print(r.choices[0].logprobs.top_logprobs)
+        else:
+            print(r.choices[0].text, end="", flush=True)
+        assert r.id
+        assert r.usage.prompt_tokens > 0
+        assert r.usage.completion_tokens > 0
+        assert r.usage.total_tokens > 0
+    print("=" * 100)
+
+
+def test_chat_completion(args):
+    client = openai.Client(api_key="EMPTY", base_url=args.base_url)
+    response = client.chat.completions.create(
+        model="default",
+        messages=[
+            {"role": "system", "content": "You are a helpful AI assistant"},
+            {"role": "user", "content": "What is the capital of France?"},
+        ],
+        temperature=0,
+        max_tokens=32,
+    )
+    print(response.choices[0].message.content)
+    assert response.id
+    assert response.created
+    assert response.usage.prompt_tokens > 0
+    assert response.usage.completion_tokens > 0
+    assert response.usage.total_tokens > 0
+    print("=" * 100)
+
+
+def test_chat_completion_image(args):
+    client = openai.Client(api_key="EMPTY", base_url=args.base_url)
+    response = client.chat.completions.create(
+        model="default",
+        messages=[
+            {"role": "system", "content": "You are a helpful AI assistant"},
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "Describe this image"},
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": "https://raw.githubusercontent.com/sgl-project/sglang/main/assets/mixtral_8x7b.jpg"
+                        },
+                    },
+                ],
+            },
+        ],
+        temperature=0,
+        max_tokens=32,
+    )
+    print(response.choices[0].message.content)
+    assert response.id
+    assert response.created
+    assert response.usage.prompt_tokens > 0
+    assert response.usage.completion_tokens > 0
+    assert response.usage.total_tokens > 0
+    print("=" * 100)
+
+
+def test_chat_completion_stream(args):
+    client = openai.Client(api_key="EMPTY", base_url=args.base_url)
+    response = client.chat.completions.create(
+        model="default",
+        messages=[
+            {"role": "system", "content": "You are a helpful AI assistant"},
+            {"role": "user", "content": "List 3 countries and their capitals."},
+        ],
+        temperature=0,
+        max_tokens=64,
+        stream=True,
+    )
+    is_first = True
+    for chunk in response:
+        if is_first:
+            is_first = False
+            assert chunk.choices[0].delta.role == "assistant"
+            continue
+
+        data = chunk.choices[0].delta
+        if not data.content:
+            continue
+        print(data.content, end="", flush=True)
+    print("=" * 100)
+
+
+def test_regex(args):
+    client = openai.Client(api_key="EMPTY", base_url=args.base_url)
+
+    regex = (
+        r"""\{\n"""
+        + r"""   "name": "[\w]+",\n"""
+        + r"""   "population": [\d]+\n"""
+        + r"""\}"""
+    )
+
+    response = client.chat.completions.create(
+        model="default",
+        messages=[
+            {"role": "system", "content": "You are a helpful AI assistant"},
+            {"role": "user", "content": "Introduce the capital of France."},
+        ],
+        temperature=0,
+        max_tokens=128,
+        extra_body={"regex": regex},
+    )
+    text = response.choices[0].message.content
+    print(json.loads(text))
+    print("=" * 100)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--base-url", type=str, default="http://127.0.0.1:30000/v1")
+    parser.add_argument(
+        "--test-image", action="store_true", help="Enables testing image inputs"
+    )
+    args = parser.parse_args()
+
+    test_completion(args, echo=False, logprobs=False)
+    test_completion(args, echo=True, logprobs=False)
+    test_completion(args, echo=False, logprobs=True)
+    test_completion(args, echo=True, logprobs=True)
+    test_completion(args, echo=False, logprobs=3)
+    test_completion(args, echo=True, logprobs=3)
+    test_completion_stream(args, echo=False, logprobs=False)
+    test_completion_stream(args, echo=True, logprobs=False)
+    test_completion_stream(args, echo=False, logprobs=True)
+    test_completion_stream(args, echo=True, logprobs=True)
+    test_completion_stream(args, echo=False, logprobs=3)
+    test_completion_stream(args, echo=True, logprobs=3)
+    test_chat_completion(args)
+    test_chat_completion_stream(args)
+    test_regex(args)
+    if args.test_image:
+        test_chat_completion_image(args)
--- a/test/srt/old/test_robust.py
+++ b/test/srt/old/test_robust.py
--- a/test/srt/test_openai_server.py
+++ b/test/srt/test_openai_server.py
@@ -1,209 +1,123 @@
-"""
-First run the following command to launch the server.
-Note that TinyLlama adopts different chat templates in different versions.
-For v0.4, the chat template is chatml.
-
-python3 -m sglang.launch_server --model-path TinyLlama/TinyLlama-1.1B-Chat-v0.4 \
--port 30000 --chat-template chatml
-
-Output example:
-The capital of France is Paris.
-The capital of the United States is Washington, D.C.
-The capital of Canada is Ottawa.
-The capital of Japan is Tokyo
-"""
-
-import argparse
-import json
+import subprocess
+import time
+import unittest

 import openai
+import requests
+
+from sglang.srt.utils import kill_child_process


-def test_completion(args, echo, logprobs):
-    client = openai.Client(api_key="EMPTY", base_url=args.base_url)
-    response = client.completions.create(
-        model="default",
-        prompt="The capital of France is",
-        temperature=0,
-        max_tokens=32,
-        echo=echo,
-        logprobs=logprobs,
-    )
-    text = response.choices[0].text
-    print(response.choices[0].text)
-    if echo:
-        assert text.startswith("The capital of France is")
-    if logprobs:
-        print(response.choices[0].logprobs.top_logprobs)
-        assert response.choices[0].logprobs
+class TestOpenAIServer(unittest.TestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        model = "meta-llama/Meta-Llama-3.1-8B-Instruct"
+        port = 30000
+        timeout = 300
+
+        command = [
+            "python3", "-m", "sglang.launch_server",
+            "--model-path", model,
+            "--host", "localhost",
+            "--port", str(port),
+        ]
+        cls.process = subprocess.Popen(command, stdout=None, stderr=None)
+        cls.base_url = f"http://localhost:{port}/v1"
+        cls.model = model
+
+        start_time = time.time()
+        while time.time() - start_time < timeout:
+            try:
+                response = requests.get(f"{cls.base_url}/models")
+                if response.status_code == 200:
+                    return
+            except requests.RequestException:
+                pass
+            time.sleep(10)
+        raise TimeoutError("Server failed to start within the timeout period.")
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_child_process(cls.process.pid)
+
+    def run_completion(self, echo, logprobs):
+        client = openai.Client(api_key="EMPTY", base_url=self.base_url)
+        prompt = "The capital of France is"
+        response = client.completions.create(
+            model=self.model,
+            prompt=prompt,
+            temperature=0.1,
+            max_tokens=32,
+            echo=echo,
+            logprobs=logprobs,
+        )
+        text = response.choices[0].text
        if echo:
-            assert response.choices[0].logprobs.token_logprobs[0] == None
-        else:
-            assert response.choices[0].logprobs.token_logprobs[0] != None
-    assert response.id
-    assert response.created
-    assert response.usage.prompt_tokens > 0
-    assert response.usage.completion_tokens > 0
-    assert response.usage.total_tokens > 0
-    print("=" * 100)
-
-
-def test_completion_stream(args, echo, logprobs):
-    client = openai.Client(api_key="EMPTY", base_url=args.base_url)
-    response = client.completions.create(
-        model="default",
-        prompt="The capital of France is",
-        temperature=0,
-        max_tokens=32,
-        stream=True,
-        echo=echo,
-        logprobs=logprobs,
-    )
-    first = True
-    for r in response:
-        if first:
-            if echo:
-                assert r.choices[0].text.startswith("The capital of France is")
-            first = False
+            assert text.startswith(prompt)
        if logprobs:
-            print(
-                f"{r.choices[0].text:12s}\t" f"{r.choices[0].logprobs.token_logprobs}",
-                flush=True,
-            )
-            print(r.choices[0].logprobs.top_logprobs)
-        else:
-            print(r.choices[0].text, end="", flush=True)
-        assert r.id
-        assert r.usage.prompt_tokens > 0
-        assert r.usage.completion_tokens > 0
-        assert r.usage.total_tokens > 0
-    print("=" * 100)
+            assert response.choices[0].logprobs
+            assert isinstance(response.choices[0].logprobs.tokens[0], str)
+            assert isinstance(response.choices[0].logprobs.top_logprobs[1], dict)
+            assert len(response.choices[0].logprobs.top_logprobs[1]) == logprobs
+            if echo:
+                assert response.choices[0].logprobs.token_logprobs[0] == None
+            else:
+                assert response.choices[0].logprobs.token_logprobs[0] != None
+        assert response.id
+        assert response.created
+        assert response.usage.prompt_tokens > 0
+        assert response.usage.completion_tokens > 0
+        assert response.usage.total_tokens > 0

+    def run_completion_stream(self, echo, logprobs):
+        client = openai.Client(api_key="EMPTY", base_url=self.base_url)
+        prompt = "The capital of France is"
+        generator = client.completions.create(
+            model=self.model,
+            prompt=prompt,
+            temperature=0.1,
+            max_tokens=32,
+            echo=echo,
+            logprobs=logprobs,
+            stream=True,
+        )

-def test_chat_completion(args):
-    client = openai.Client(api_key="EMPTY", base_url=args.base_url)
-    response = client.chat.completions.create(
-        model="default",
-        messages=[
-            {"role": "system", "content": "You are a helpful AI assistant"},
-            {"role": "user", "content": "What is the capital of France?"},
-        ],
-        temperature=0,
-        max_tokens=32,
-    )
-    print(response.choices[0].message.content)
-    assert response.id
-    assert response.created
-    assert response.usage.prompt_tokens > 0
-    assert response.usage.completion_tokens > 0
-    assert response.usage.total_tokens > 0
-    print("=" * 100)
+        first = True
+        for response in generator:
+            if logprobs:
+                assert response.choices[0].logprobs
+                assert isinstance(response.choices[0].logprobs.tokens[0], str)
+                if not (first and echo):
+                    assert isinstance(response.choices[0].logprobs.top_logprobs[0], dict)
+                    #assert len(response.choices[0].logprobs.top_logprobs[0]) == logprobs

+            if first:
+                if echo:
+                    assert response.choices[0].text.startswith(prompt)
+                first = False

-def test_chat_completion_image(args):
-    client = openai.Client(api_key="EMPTY", base_url=args.base_url)
-    response = client.chat.completions.create(
-        model="default",
-        messages=[
-            {"role": "system", "content": "You are a helpful AI assistant"},
-            {
-                "role": "user",
-                "content": [
-                    {"type": "text", "text": "Describe this image"},
-                    {
-                        "type": "image_url",
-                        "image_url": {
-                            "url": "https://raw.githubusercontent.com/sgl-project/sglang/main/assets/mixtral_8x7b.jpg"
-                        },
-                    },
-                ],
-            },
-        ],
-        temperature=0,
-        max_tokens=32,
-    )
-    print(response.choices[0].message.content)
-    assert response.id
-    assert response.created
-    assert response.usage.prompt_tokens > 0
-    assert response.usage.completion_tokens > 0
-    assert response.usage.total_tokens > 0
-    print("=" * 100)
+            assert response.id
+            assert response.created
+            assert response.usage.prompt_tokens > 0
+            assert response.usage.completion_tokens > 0
+            assert response.usage.total_tokens > 0

+    def test_completion(self):
+        for echo in [False, True]:
+            for logprobs in [None, 5]:
+                self.run_completion(echo, logprobs)

-def test_chat_completion_stream(args):
-    client = openai.Client(api_key="EMPTY", base_url=args.base_url)
-    response = client.chat.completions.create(
-        model="default",
-        messages=[
-            {"role": "system", "content": "You are a helpful AI assistant"},
-            {"role": "user", "content": "List 3 countries and their capitals."},
-        ],
-        temperature=0,
-        max_tokens=64,
-        stream=True,
-    )
-    is_first = True
-    for chunk in response:
-        if is_first:
-            is_first = False
-            assert chunk.choices[0].delta.role == "assistant"
-            continue
-
-        data = chunk.choices[0].delta
-        if not data.content:
-            continue
-        print(data.content, end="", flush=True)
-    print("=" * 100)
-
-
-def test_regex(args):
-    client = openai.Client(api_key="EMPTY", base_url=args.base_url)
-
-    regex = (
-        r"""\{\n"""
-        + r"""   "name": "[\w]+",\n"""
-        + r"""   "population": [\d]+\n"""
-        + r"""\}"""
-    )
-
-    response = client.chat.completions.create(
-        model="default",
-        messages=[
-            {"role": "system", "content": "You are a helpful AI assistant"},
-            {"role": "user", "content": "Introduce the capital of France."},
-        ],
-        temperature=0,
-        max_tokens=128,
-        extra_body={"regex": regex},
-    )
-    text = response.choices[0].message.content
-    print(json.loads(text))
-    print("=" * 100)
+    def test_completion_stream(self):
+        for echo in [True]:
+            for logprobs in [5]:
+                self.run_completion_stream(echo, logprobs)


 if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--base-url", type=str, default="http://127.0.0.1:30000/v1")
-    parser.add_argument(
-        "--test-image", action="store_true", help="Enables testing image inputs"
-    )
-    args = parser.parse_args()
+    # unittest.main(warnings="ignore")

-    test_completion(args, echo=False, logprobs=False)
-    test_completion(args, echo=True, logprobs=False)
-    test_completion(args, echo=False, logprobs=True)
-    test_completion(args, echo=True, logprobs=True)
-    test_completion(args, echo=False, logprobs=3)
-    test_completion(args, echo=True, logprobs=3)
-    test_completion_stream(args, echo=False, logprobs=False)
-    test_completion_stream(args, echo=True, logprobs=False)
-    test_completion_stream(args, echo=False, logprobs=True)
-    test_completion_stream(args, echo=True, logprobs=True)
-    test_completion_stream(args, echo=False, logprobs=3)
-    test_completion_stream(args, echo=True, logprobs=3)
-    test_chat_completion(args)
-    test_chat_completion_stream(args)
-    test_regex(args)
-    if args.test_image:
-        test_chat_completion_image(args)
+    t = TestOpenAIServer()
+    t.setUpClass()
+    t.test_completion_stream()
+    t.tearDownClass()