Separate two entry points: Engine and HTTP server (#2996)

Co-authored-by: fzyzcjy <5236035+fzyzcjy@users.noreply.github.com>
2025-01-19 22:09:24 -08:00
parent 44a9669770
commit 03464890e0
18 changed files with 1126 additions and 1047 deletions
--- a/test/srt/test_metrics.py
+++ b/test/srt/test_metrics.py
@@ -56,7 +56,6 @@ class TestEnableMetrics(unittest.TestCase):
                "sglang:gen_throughput",
                "sglang:num_queue_reqs",
                "sglang:cache_hit_rate",
-                "sglang:func_latency_seconds",
                "sglang:prompt_tokens_total",
                "sglang:generation_tokens_total",
                "sglang:num_requests_total",
--- a/test/srt/test_nightly_gsm8k_eval.py
+++ b/test/srt/test_nightly_gsm8k_eval.py
@@ -45,7 +45,7 @@ def parse_models(model_string):
    return [model.strip() for model in model_string.split(",") if model.strip()]


-def launch_server(base_url, model, is_fp8, is_tp2):
+def popen_launch_server_wrapper(base_url, model, is_fp8, is_tp2):
    other_args = ["--log-level-http", "warning", "--trust-remote-code"]
    if is_fp8:
        if "Llama-3" in model or "gemma-2" in model:
@@ -148,7 +148,9 @@ class TestNightlyGsm8KEval(unittest.TestCase):
        for model_group, is_fp8, is_tp2 in self.model_groups:
            for model in model_group:
                with self.subTest(model=model):
-                    process = launch_server(self.base_url, model, is_fp8, is_tp2)
+                    process = popen_launch_server_wrapper(
+                        self.base_url, model, is_fp8, is_tp2
+                    )

                    args = SimpleNamespace(
                        base_url=self.base_url,
--- a/test/srt/test_nightly_human_eval.py
+++ b/test/srt/test_nightly_human_eval.py
@@ -4,7 +4,7 @@ import signal
 import subprocess
 import unittest

-from test_nightly_gsm8k_eval import launch_server, parse_models
+from test_nightly_gsm8k_eval import parse_models, popen_launch_server_wrapper

 from sglang.srt.utils import kill_process_tree
 from sglang.test.test_utils import (
@@ -93,7 +93,7 @@ class TestNightlyHumanEval(unittest.TestCase):
                # NOTE: only Llama for now
                if "Llama" in model:
                    with self.subTest(model=model):
-                        self.process = launch_server(
+                        self.process = popen_launch_server_wrapper(
                            self.base_url, model, is_fp8, is_tp2
                        )
                        self.run_evalplus(model)
--- a/test/srt/test_srt_engine.py
+++ b/test/srt/test_srt_engine.py
@@ -1,6 +1,6 @@
 """
 Usage:
-python3 -m unittest test_srt_engine.TestSRTEngine.test_3_sync_streaming_combination
+python3 -m unittest test_srt_engine.TestSRTEngine.test_4_sync_async_stream_combination
 """

 import asyncio
@@ -44,83 +44,29 @@ class TestSRTEngine(unittest.TestCase):
        print(out2)
        self.assertEqual(out1, out2)

-    def test_2_engine_multiple_generate(self):
+    def test_2_engine_runtime_encode_consistency(self):
+        prompt = "Today is a sunny day and I like"
+        model_path = DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST
+
+        engine = sgl.Engine(model_path=model_path, is_embedding=True, random_seed=42)
+        out1 = torch.tensor(engine.encode(prompt)["embedding"])
+        engine.shutdown()
+
+        runtime = sgl.Runtime(model_path=model_path, is_embedding=True, random_seed=42)
+        out2 = torch.tensor(json.loads(runtime.encode(prompt))["embedding"])
+        runtime.shutdown()
+
+        self.assertTrue(torch.allclose(out1, out2, atol=1e-5, rtol=1e-3))
+
+    def test_3_engine_token_ids_consistency(self):
        # just to ensure there is no issue running multiple generate calls
        prompt = "Today is a sunny day and I like"
        model_path = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
-
        sampling_params = {"temperature": 0, "max_new_tokens": 8}

-        engine = sgl.Engine(model_path=model_path, random_seed=42)
-        engine.generate(prompt, sampling_params)
-        engine.generate(prompt, sampling_params)
-        engine.shutdown()
-
-    def test_3_sync_streaming_combination(self):
-
-        prompt = "AI safety is..."
-        sampling_params = {"temperature": 0.8, "top_p": 0.95}
-
-        async def async_streaming(engine):
-
-            generator = await engine.async_generate(
-                prompt, sampling_params, stream=True
-            )
-
-            async for output in generator:
-                print(output["text"], end="", flush=True)
-            print()
-
-        # Create an LLM.
-        llm = sgl.Engine(
-            model_path=DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
-        )
-
-        # 1. sync + non streaming
-        print("\n\n==== 1. sync + non streaming ====")
-        output = llm.generate(prompt, sampling_params)
-
-        print(output["text"])
-
-        # 2. sync + streaming
-        print("\n\n==== 2. sync + streaming ====")
-        output_generator = llm.generate(prompt, sampling_params, stream=True)
-        for output in output_generator:
-            print(output["text"], end="", flush=True)
-        print()
-
-        loop = asyncio.get_event_loop()
-        # 3. async + non_streaming
-        print("\n\n==== 3. async + non streaming ====")
-        output = loop.run_until_complete(llm.async_generate(prompt, sampling_params))
-        print(output["text"])
-
-        # 4. async + streaming
-        print("\n\n==== 4. async + streaming ====")
-        loop.run_until_complete(async_streaming(llm))
-
-        llm.shutdown()
-
-    def test_4_gsm8k(self):
-
-        args = SimpleNamespace(
-            model_path=DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
-            local_data_path=None,
-            num_shots=5,
-            num_questions=200,
-        )
-
-        metrics = run_eval(args)
-        self.assertGreater(metrics["accuracy"], 0.3)
-
-    def test_5_prompt_input_ids_consistency(self):
-        prompt = "The capital of UK is"
-
-        model_path = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
        engine = sgl.Engine(
            model_path=model_path, random_seed=42, disable_radix_cache=True
        )
-        sampling_params = {"temperature": 0, "max_new_tokens": 8}
        out1 = engine.generate(prompt, sampling_params)["text"]

        tokenizer = get_tokenizer(model_path)
@@ -138,21 +84,69 @@ class TestSRTEngine(unittest.TestCase):
        print(out2)
        self.assertEqual(out1, out2)

-    def test_6_engine_runtime_encode_consistency(self):
-        prompt = "Today is a sunny day and I like"
-        model_path = DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST
+    def test_4_sync_async_stream_combination(self):
+        prompt = "AI safety is"
+        sampling_params = {"temperature": 0.8, "top_p": 0.95}

-        engine = sgl.Engine(model_path=model_path, is_embedding=True, random_seed=42)
-        out1 = torch.tensor(engine.encode(prompt)["embedding"])
-        engine.shutdown()
+        # Create an LLM.
+        llm = sgl.Engine(
+            model_path=DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
+        )

-        runtime = sgl.Runtime(model_path=model_path, is_embedding=True, random_seed=42)
-        out2 = torch.tensor(json.loads(runtime.encode(prompt))["embedding"])
-        runtime.shutdown()
+        if True:
+            # 1. sync + non streaming
+            print("\n\n==== 1. sync + non streaming ====")
+            output = llm.generate(prompt, sampling_params)
+            print(output["text"])

-        self.assertTrue(torch.allclose(out1, out2, atol=1e-5, rtol=1e-3))
+            # 2. sync + streaming
+            print("\n\n==== 2. sync + streaming ====")
+            output_generator = llm.generate(prompt, sampling_params, stream=True)
+            offset = 0
+            for output in output_generator:
+                print(output["text"][offset:], end="", flush=True)
+                offset = len(output["text"])
+            print()

-    def test_7_engine_cpu_offload(self):
+        if True:
+            loop = asyncio.get_event_loop()
+            # 3. async + non_streaming
+            print("\n\n==== 3. async + non streaming ====")
+            output = loop.run_until_complete(
+                llm.async_generate(prompt, sampling_params)
+            )
+            print(output["text"])
+
+            # 4. async + streaming
+            async def async_streaming(engine):
+                generator = await engine.async_generate(
+                    prompt, sampling_params, stream=True
+                )
+
+                offset = 0
+                async for output in generator:
+                    print(output["text"][offset:], end="", flush=True)
+                    offset = len(output["text"])
+                print()
+
+            print("\n\n==== 4. async + streaming ====")
+            loop.run_until_complete(async_streaming(llm))
+
+        llm.shutdown()
+
+    def test_5_gsm8k(self):
+
+        args = SimpleNamespace(
+            model_path=DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
+            local_data_path=None,
+            num_shots=5,
+            num_questions=200,
+        )
+
+        metrics = run_eval(args)
+        self.assertGreater(metrics["accuracy"], 0.3)
+
+    def test_6_engine_cpu_offload(self):
        prompt = "Today is a sunny day and I like"
        model_path = DEFAULT_SMALL_MODEL_NAME_FOR_TEST

@@ -182,7 +176,7 @@ class TestSRTEngine(unittest.TestCase):
        print(out2)
        self.assertEqual(out1, out2)

-    def test_8_engine_offline_throughput(self):
+    def test_7_engine_offline_throughput(self):
        server_args = ServerArgs(
            model_path=DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
        )