From 54fb1c80c0d7bbf100d4efc84d1aad4bee094ff0 Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <lianminzheng@gmail.com>
Date: Sat, 10 Aug 2024 15:09:03 -0700
Subject: [PATCH] Clean up unit tests (#1020)

---
 .github/workflows/unit-test.yml               | 10 +++---
 README.md                                     | 24 ++++++-------
 docs/en/contributor_guide.md                  |  5 ++-
 python/sglang/srt/managers/tp_worker.py       |  7 ++--
 python/sglang/test/test_programs.py           |  6 ++--
 test/README.md                                | 36 +++++++++++--------
 test/lang/test_anthropic_backend.py           |  9 +----
 test/lang/test_bind_cache.py                  |  6 +---
 test/lang/test_choices.py                     |  7 +---
 test/lang/test_litellm_backend.py             |  2 +-
 test/lang/test_openai_backend.py              |  9 +----
 test/lang/test_srt_backend.py                 | 10 +-----
 test/lang/test_tracing.py                     |  5 +--
 test/lang/test_vertexai_backend.py            | 21 +++--------
 test/srt/run_suite.py                         |  2 +-
 test/srt/test_chunked_prefill.py              |  7 +---
 test/srt/test_embedding_openai_server.py      | 16 +++------
 test/srt/test_eval_accuracy.py                |  7 +---
 test/srt/test_models_from_modelscope.py       |  2 +-
 test/srt/test_openai_server.py                |  7 +---
 ...zer_srt.py => test_skip_tokenizer_init.py} | 24 ++++---------
 test/srt/test_srt_endpoint.py                 |  3 +-
 test/srt/test_torch_compile.py                |  7 +---
 test/srt/test_vision_openai_server.py         |  7 +---
 24 files changed, 82 insertions(+), 157 deletions(-)
 rename test/srt/{test_skip_tokenizer_srt.py => test_skip_tokenizer_init.py} (73%)

diff --git a/.github/workflows/unit-test.yml b/.github/workflows/unit-test.yml
index 8d4ddcdb7..f9b79dc67 100644
--- a/.github/workflows/unit-test.yml
+++ b/.github/workflows/unit-test.yml
@@ -37,12 +37,12 @@ jobs:
         pip install accelerate
         pip install sentence_transformers
 
-    - name: Test Frontend Language
-      run: |
-        cd test/lang
-        python3 run_suite.py --suite minimal
-
     - name: Test Backend Runtime
       run: |
         cd test/srt
         python3 run_suite.py --suite minimal
+
+    - name: Test Frontend Language
+      run: |
+        cd test/lang
+        python3 run_suite.py --suite minimal
diff --git a/README.md b/README.md
index 9be13509f..8cccd6a37 100644
--- a/README.md
+++ b/README.md
@@ -167,17 +167,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - If the model does not have a template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/en/custom_chat_template.md).
 - To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 - To enable experimental torch.compile support, you can add `--enable-torch-compile`. It accelerates small models on small batch sizes.
-
-### Use Models From ModelScope
-To use model from [ModelScope](https://www.modelscope.cn), setting environment variable SGLANG_USE_MODELSCOPE.
-```
-export SGLANG_USE_MODELSCOPE=true
-```
-Launch [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instruct) Server
-```
-SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000
-```    
-  
+ 
 ### Supported Models
 
 - Llama / Llama 2 / Llama 3 / Llama 3.1
@@ -203,7 +193,17 @@ SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen
 
 Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/en/model_support.md).
 
-### Run Llama 3.1 405B
+#### Use Models From ModelScope
+To use a model from [ModelScope](https://www.modelscope.cn), set the environment variable `SGLANG_USE_MODELSCOPE`:
+```
+export SGLANG_USE_MODELSCOPE=true
+```
+Launch the [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instruct) server:
+```
+SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000
+```    
+
+#### Run Llama 3.1 405B
 
 ```bash
 ## Run 405B (fp8) on a single node
diff --git a/docs/en/contributor_guide.md b/docs/en/contributor_guide.md
index 7a87187c1..1ebdd0379 100644
--- a/docs/en/contributor_guide.md
+++ b/docs/en/contributor_guide.md
@@ -6,6 +6,9 @@ Use these commands to format your code and pass CI linting tests.
 ```
 pip3 install pre-commit
 cd sglang
-pre-commit install .
+pre-commit install
 pre-commit run --all-files
 ```
+
+## Add Unit Tests
+Add unit tests under [sglang/test](../../test). You can learn how to add and run tests from the README.md in that folder.
diff --git a/python/sglang/srt/managers/tp_worker.py b/python/sglang/srt/managers/tp_worker.py
index c66897710..e425a3c37 100644
--- a/python/sglang/srt/managers/tp_worker.py
+++ b/python/sglang/srt/managers/tp_worker.py
@@ -461,8 +461,11 @@ class ModelTpServer:
                 next_token_ids = next_token_ids.tolist()
             else:
                 if self.tokenizer is None:
-                    for i, req in enumerate(batch.reqs):
-                        next_token_ids.extend(req.sampling_params.stop_token_ids)
+                    next_token_ids = []
+                    for req in batch.reqs:
+                        next_token_ids.append(
+                            next(iter(req.sampling_params.stop_token_ids))
+                        )
                 else:
                     next_token_ids = [self.tokenizer.eos_token_id] * len(batch.reqs)
 
diff --git a/python/sglang/test/test_programs.py b/python/sglang/test/test_programs.py
index 710871ba5..7c7c9bdcb 100644
--- a/python/sglang/test/test_programs.py
+++ b/python/sglang/test/test_programs.py
@@ -149,7 +149,7 @@ def test_decode_json():
     assert isinstance(js_obj["population"], int)
 
 
-def test_expert_answer():
+def test_expert_answer(check_answer=True):
     @sgl.function
     def expert_answer(s, question):
         s += "Question: " + question + "\n"
@@ -167,7 +167,9 @@ def test_expert_answer():
         )
 
     ret = expert_answer.run(question="What is the capital of France?", temperature=0.1)
-    assert "paris" in ret.text().lower()
+
+    if check_answer:
+        assert "paris" in ret.text().lower(), f"Answer: {ret.text()}"
 
 
 def test_tool_use():
diff --git a/test/README.md b/test/README.md
index cdfbbaee8..b9cf63ff1 100644
--- a/test/README.md
+++ b/test/README.md
@@ -1,26 +1,32 @@
 # Run Unit Tests
 
-## Test Frontend Language
+SGLang uses Python's standard-library [unittest](https://docs.python.org/3/library/unittest.html) module as its testing framework.
+
+## Test Backend Runtime
+```bash
+cd sglang/test/srt
+
+# Run a single file
+python3 test_srt_endpoint.py
+
+# Run a single test
+python3 -m unittest test_srt_endpoint.TestSRTEndpoint.test_simple_decode
+
+# Run a suite with multiple files
+python3 run_suite.py --suite minimal
 ```
+
+## Test Frontend Language
+```bash
 cd sglang/test/lang
 export OPENAI_API_KEY=sk-*****
 
 # Run a single file
 python3 test_openai_backend.py
 
-# Run a suite
+# Run a single test
+python3 -m unittest test_openai_backend.TestOpenAIBackend.test_few_shot_qa
+
+# Run a suite with multiple files
 python3 run_suite.py --suite minimal
 ```
-
-## Test Backend Runtime
-```
-cd sglang/test/srt
-
-# Run a single file
-python3 test_eval_accuracy.py
-
-# Run a suite
-python3 run_suite.py --suite minimal
-```
-
-
diff --git a/test/lang/test_anthropic_backend.py b/test/lang/test_anthropic_backend.py
index 87b27a765..03911449d 100644
--- a/test/lang/test_anthropic_backend.py
+++ b/test/lang/test_anthropic_backend.py
@@ -21,11 +21,4 @@ class TestAnthropicBackend(unittest.TestCase):
 
 
 if __name__ == "__main__":
-    unittest.main(warnings="ignore")
-
-    # from sglang.global_config import global_config
-
-    # global_config.verbosity = 2
-    # t = TestAnthropicBackend()
-    # t.setUpClass()
-    # t.test_mt_bench()
+    unittest.main()
diff --git a/test/lang/test_bind_cache.py b/test/lang/test_bind_cache.py
index 14a7e5098..5ed68ff45 100644
--- a/test/lang/test_bind_cache.py
+++ b/test/lang/test_bind_cache.py
@@ -48,8 +48,4 @@ class TestBind(unittest.TestCase):
 
 
 if __name__ == "__main__":
-    unittest.main(warnings="ignore")
-
-    # t = TestBind()
-    # t.setUpClass()
-    # t.test_cache()
+    unittest.main()
diff --git a/test/lang/test_choices.py b/test/lang/test_choices.py
index da25e9e49..88cd22dfb 100644
--- a/test/lang/test_choices.py
+++ b/test/lang/test_choices.py
@@ -87,9 +87,4 @@ class TestChoices(unittest.TestCase):
 
 
 if __name__ == "__main__":
-    unittest.main(warnings="ignore")
-
-    # t = TestChoices()
-    # t.test_token_length_normalized()
-    # t.test_greedy_token_selection()
-    # t.test_unconditional_likelihood_normalized()
+    unittest.main()
diff --git a/test/lang/test_litellm_backend.py b/test/lang/test_litellm_backend.py
index 3c7f5db21..649e2e4d3 100644
--- a/test/lang/test_litellm_backend.py
+++ b/test/lang/test_litellm_backend.py
@@ -21,4 +21,4 @@ class TestAnthropicBackend(unittest.TestCase):
 
 
 if __name__ == "__main__":
-    unittest.main(warnings="ignore")
+    unittest.main()
diff --git a/test/lang/test_openai_backend.py b/test/lang/test_openai_backend.py
index b1bb47b82..220784ab3 100644
--- a/test/lang/test_openai_backend.py
+++ b/test/lang/test_openai_backend.py
@@ -88,11 +88,4 @@ class TestOpenAIBackend(unittest.TestCase):
 
 
 if __name__ == "__main__":
-    unittest.main(warnings="ignore")
-
-    # from sglang.global_config import global_config
-
-    # global_config.verbosity = 2
-    # t = TestOpenAIBackend()
-    # t.setUpClass()
-    # t.test_stream()
+    unittest.main()
diff --git a/test/lang/test_srt_backend.py b/test/lang/test_srt_backend.py
index 778cde8be..b2a07ae36 100644
--- a/test/lang/test_srt_backend.py
+++ b/test/lang/test_srt_backend.py
@@ -61,12 +61,4 @@ class TestSRTBackend(unittest.TestCase):
 
 
 if __name__ == "__main__":
-    unittest.main(warnings="ignore")
-
-    # from sglang.global_config import global_config
-
-    # global_config.verbosity = 2
-    # t = TestSRTBackend()
-    # t.setUpClass()
-    # t.test_few_shot_qa()
-    # t.tearDownClass()
+    unittest.main()
diff --git a/test/lang/test_tracing.py b/test/lang/test_tracing.py
index 5f2bc1d04..7c3af071b 100644
--- a/test/lang/test_tracing.py
+++ b/test/lang/test_tracing.py
@@ -125,7 +125,4 @@ class TestTracing(unittest.TestCase):
 
 
 if __name__ == "__main__":
-    unittest.main(warnings="ignore")
-
-    # t = TestTracing()
-    # t.test_multi_function()
+    unittest.main()
diff --git a/test/lang/test_vertexai_backend.py b/test/lang/test_vertexai_backend.py
index b29efaa75..da229854e 100644
--- a/test/lang/test_vertexai_backend.py
+++ b/test/lang/test_vertexai_backend.py
@@ -14,26 +14,22 @@ from sglang.test.test_programs import (
 
 class TestVertexAIBackend(unittest.TestCase):
     backend = None
-    chat_backend = None
-    chat_vision_backend = None
 
     @classmethod
     def setUpClass(cls):
-        cls.backend = VertexAI("gemini-pro")
-        cls.chat_backend = VertexAI("gemini-pro")
-        cls.chat_vision_backend = VertexAI("gemini-pro-vision")
+        cls.backend = VertexAI("gemini-1.5-pro-001")
 
     def test_few_shot_qa(self):
         set_default_backend(self.backend)
         test_few_shot_qa()
 
     def test_mt_bench(self):
-        set_default_backend(self.chat_backend)
+        set_default_backend(self.backend)
         test_mt_bench()
 
     def test_expert_answer(self):
         set_default_backend(self.backend)
-        test_expert_answer()
+        test_expert_answer(check_answer=False)
 
     def test_parallel_decoding(self):
         set_default_backend(self.backend)
@@ -44,7 +40,7 @@ class TestVertexAIBackend(unittest.TestCase):
         test_parallel_encoding()
 
     def test_image_qa(self):
-        set_default_backend(self.chat_vision_backend)
+        set_default_backend(self.backend)
         test_image_qa()
 
     def test_stream(self):
@@ -53,11 +49,4 @@ class TestVertexAIBackend(unittest.TestCase):
 
 
 if __name__ == "__main__":
-    unittest.main(warnings="ignore")
-
-    # from sglang.global_config import global_config
-
-    # global_config.verbosity = 2
-    # t = TestVertexAIBackend()
-    # t.setUpClass()
-    # t.test_stream()
+    unittest.main()
diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py
index 2bc37b682..288645c21 100644
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -6,9 +6,9 @@ from sglang.test.test_utils import run_unittest_files
 suites = {
     "minimal": [
         "test_eval_accuracy.py",
-        "test_embedding_openai_server.py",
         "test_openai_server.py",
         "test_vision_openai_server.py",
+        "test_embedding_openai_server.py",
         "test_chunked_prefill.py",
         "test_torch_compile.py",
         "test_models_from_modelscope.py",
diff --git a/test/srt/test_chunked_prefill.py b/test/srt/test_chunked_prefill.py
index 7f274926a..271b73fab 100644
--- a/test/srt/test_chunked_prefill.py
+++ b/test/srt/test_chunked_prefill.py
@@ -37,9 +37,4 @@ class TestAccuracy(unittest.TestCase):
 
 
 if __name__ == "__main__":
-    unittest.main(warnings="ignore")
-
-    # t = TestAccuracy()
-    # t.setUpClass()
-    # t.test_mmlu()
-    # t.tearDownClass()
+    unittest.main()
diff --git a/test/srt/test_embedding_openai_server.py b/test/srt/test_embedding_openai_server.py
index 72dc7a009..ed7db6643 100644
--- a/test/srt/test_embedding_openai_server.py
+++ b/test/srt/test_embedding_openai_server.py
@@ -1,11 +1,8 @@
-import json
-import time
 import unittest
 
 import openai
 
 from sglang.srt.hf_transformers_utils import get_tokenizer
-from sglang.srt.openai_api.protocol import EmbeddingObject
 from sglang.srt.utils import kill_child_process
 from sglang.test.test_utils import popen_launch_server
 
@@ -65,12 +62,12 @@ class TestOpenAIServer(unittest.TestCase):
         ), f"{response.usage.total_tokens} vs {num_prompt_tokens}"
 
     def run_batch(self):
-        # FIXME not implemented
+        # FIXME: not implemented
         pass
 
     def test_embedding(self):
-        # TODO the fields of encoding_format, dimensions, user are skipped
-        # TODO support use_list_input
+        # TODO: the fields of encoding_format, dimensions, user are skipped
+        # TODO: support use_list_input
         for use_list_input in [False, True]:
             for token_input in [False, True]:
                 self.run_embedding(use_list_input, token_input)
@@ -80,9 +77,4 @@ class TestOpenAIServer(unittest.TestCase):
 
 
 if __name__ == "__main__":
-    unittest.main(warnings="ignore")
-
-    # t = TestOpenAIServer()
-    # t.setUpClass()
-    # t.test_embedding()
-    # t.tearDownClass()
+    unittest.main()
diff --git a/test/srt/test_eval_accuracy.py b/test/srt/test_eval_accuracy.py
index b63593626..da9a4f9c6 100644
--- a/test/srt/test_eval_accuracy.py
+++ b/test/srt/test_eval_accuracy.py
@@ -32,9 +32,4 @@ class TestAccuracy(unittest.TestCase):
 
 
 if __name__ == "__main__":
-    unittest.main(warnings="ignore")
-
-    # t = TestAccuracy()
-    # t.setUpClass()
-    # t.test_mmlu()
-    # t.tearDownClass()
+    unittest.main()
diff --git a/test/srt/test_models_from_modelscope.py b/test/srt/test_models_from_modelscope.py
index 2313053b9..76853c2a6 100644
--- a/test/srt/test_models_from_modelscope.py
+++ b/test/srt/test_models_from_modelscope.py
@@ -44,4 +44,4 @@ class TestDownloadFromModelScope(unittest.TestCase):
 
 
 if __name__ == "__main__":
-    unittest.main(warnings="ignore")
+    unittest.main()
diff --git a/test/srt/test_openai_server.py b/test/srt/test_openai_server.py
index f8f6ca632..95486d70e 100644
--- a/test/srt/test_openai_server.py
+++ b/test/srt/test_openai_server.py
@@ -399,9 +399,4 @@ class TestOpenAIServer(unittest.TestCase):
 
 
 if __name__ == "__main__":
-    unittest.main(warnings="ignore")
-
-    # t = TestOpenAIServer()
-    # t.setUpClass()
-    # t.test_completion()
-    # t.tearDownClass()
+    unittest.main()
diff --git a/test/srt/test_skip_tokenizer_srt.py b/test/srt/test_skip_tokenizer_init.py
similarity index 73%
rename from test/srt/test_skip_tokenizer_srt.py
rename to test/srt/test_skip_tokenizer_init.py
index 7f0a1fe1a..7417783f6 100644
--- a/test/srt/test_skip_tokenizer_srt.py
+++ b/test/srt/test_skip_tokenizer_init.py
@@ -1,18 +1,13 @@
 import json
-import os
-import sys
 import unittest
 
 import requests
 
 from sglang.srt.utils import kill_child_process
-from sglang.test.run_eval import run_eval
 from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, popen_launch_server
 
-# os.environ["CUDA_VISIBLE_DEVICES"] = "1"
 
-
-class TestSRTEndpoint(unittest.TestCase):
+class TestSkipTokenizerInit(unittest.TestCase):
 
     @classmethod
     def setUpClass(cls):
@@ -26,9 +21,7 @@ class TestSRTEndpoint(unittest.TestCase):
     def tearDownClass(cls):
         kill_child_process(cls.process.pid)
 
-    def run_decode(
-        self, return_logprob=False, top_logprobs_num=0, return_text=False, n=1
-    ):
+    def run_decode(self, return_logprob=False, top_logprobs_num=0, n=1):
         response = requests.post(
             self.base_url + "/generate",
             json={
@@ -50,7 +43,6 @@ class TestSRTEndpoint(unittest.TestCase):
                 "stream": False,
                 "return_logprob": return_logprob,
                 "top_logprobs_num": top_logprobs_num,
-                "return_text_in_logprobs": return_text,
                 "logprob_start_len": 0,
             },
         )
@@ -65,13 +57,11 @@ class TestSRTEndpoint(unittest.TestCase):
 
     def test_logprob(self):
         for top_logprobs_num in [0, 3]:
-            for return_text in [False, False]:
-                self.run_decode(
-                    return_logprob=True,
-                    top_logprobs_num=top_logprobs_num,
-                    return_text=return_text,
-                )
+            self.run_decode(
+                return_logprob=True,
+                top_logprobs_num=top_logprobs_num,
+            )
 
 
 if __name__ == "__main__":
-    unittest.main(warnings="ignore")
+    unittest.main()
diff --git a/test/srt/test_srt_endpoint.py b/test/srt/test_srt_endpoint.py
index b208dfa13..8948e22d7 100644
--- a/test/srt/test_srt_endpoint.py
+++ b/test/srt/test_srt_endpoint.py
@@ -4,7 +4,6 @@ import unittest
 import requests
 
 from sglang.srt.utils import kill_child_process
-from sglang.test.run_eval import run_eval
 from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, popen_launch_server
 
 
@@ -59,4 +58,4 @@ class TestSRTEndpoint(unittest.TestCase):
 
 
 if __name__ == "__main__":
-    unittest.main(warnings="ignore")
+    unittest.main()
diff --git a/test/srt/test_torch_compile.py b/test/srt/test_torch_compile.py
index fd2c6ebb7..7b4664563 100644
--- a/test/srt/test_torch_compile.py
+++ b/test/srt/test_torch_compile.py
@@ -34,9 +34,4 @@ class TestAccuracy(unittest.TestCase):
 
 
 if __name__ == "__main__":
-    unittest.main(warnings="ignore")
-
-    # t = TestAccuracy()
-    # t.setUpClass()
-    # t.test_mmlu()
-    # t.tearDownClass()
+    unittest.main()
diff --git a/test/srt/test_vision_openai_server.py b/test/srt/test_vision_openai_server.py
index 982c026db..52764b6b4 100644
--- a/test/srt/test_vision_openai_server.py
+++ b/test/srt/test_vision_openai_server.py
@@ -113,9 +113,4 @@ class TestOpenAIVisionServer(unittest.TestCase):
 
 
 if __name__ == "__main__":
-    unittest.main(warnings="ignore")
-
-    # t = TestOpenAIVisionServer()
-    # t.setUpClass()
-    # t.test_chat_completion()
-    # t.tearDownClass()
+    unittest.main()