[router] Add gRPC E2E test suite (#11790)

2025-10-21 17:51:21 -07:00
parent 70f6309cd4
commit 63cfe1b032
13 changed files with 3331 additions and 5 deletions
--- a/sgl-router/py_test/e2e_grpc/features/test_enable_thinking.py
+++ b/sgl-router/py_test/e2e_grpc/features/test_enable_thinking.py
@@ -0,0 +1,204 @@
+"""
+Usage (run from sgl-router/py_test):
+python3 -m unittest e2e_grpc.features.test_enable_thinking.TestEnableThinking.test_chat_completion_with_reasoning
+python3 -m unittest e2e_grpc.features.test_enable_thinking.TestEnableThinking.test_chat_completion_without_reasoning
+python3 -m unittest e2e_grpc.features.test_enable_thinking.TestEnableThinking.test_stream_chat_completion_with_reasoning
+python3 -m unittest e2e_grpc.features.test_enable_thinking.TestEnableThinking.test_stream_chat_completion_without_reasoning
+"""
+
+import asyncio
+import json
+import os
+import sys
+import time
+import unittest
+
+# CHANGE: Import router launcher instead of server launcher
+from pathlib import Path
+
+import openai
+import requests
+
+# Put the parent of this directory on sys.path so the shared `fixtures` and
+# `util` modules imported below resolve as top-level names.
+_TEST_DIR = Path(__file__).parent
+sys.path.insert(0, str(_TEST_DIR.parent))
+from fixtures import popen_launch_workers_and_router
+from util import (
+    DEFAULT_ENABLE_THINKING_MODEL_PATH,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    get_tokenizer,
+    kill_process_tree,
+)
+
+
+class TestEnableThinking(CustomTestCase):
+    """E2E checks for the ``enable_thinking`` chat-template switch.
+
+    A router and one worker are launched once per class with the ``qwen3``
+    reasoning parser. Every test posts to ``/v1/chat/completions`` with
+    ``separate_reasoning`` enabled and inspects ``reasoning_content``.
+    """
+
+    @classmethod
+    def setUpClass(cls):
+        """Launch the router and worker shared by all tests in this class."""
+        # CHANGE: Launch gRPC router with integrated workers (single command)
+        cls.model = DEFAULT_ENABLE_THINKING_MODEL_PATH
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-1234"
+        cls.cluster = popen_launch_workers_and_router(
+            cls.model,
+            cls.base_url,
+            # Use the shared launch timeout rather than a hard-coded 120 s,
+            # as the reasoning-content suite does.
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            api_key=cls.api_key,
+            router_args=[
+                "--reasoning-parser",
+                "qwen3",
+            ],
+            num_workers=1,
+            tp_size=4,
+        )
+        # Extra request fields merged last into every chat payload.
+        cls.additional_chat_kwargs = {}
+
+    @classmethod
+    def tearDownClass(cls):
+        """Terminate the router and every worker process."""
+        try:
+            kill_process_tree(cls.cluster["router"].pid)
+        finally:
+            # Reap the workers even if stopping the router raised.
+            for worker in cls.cluster.get("workers", []):
+                kill_process_tree(worker.pid)
+
+    def _post_chat(self, enable_thinking, stream=False):
+        """POST a one-message chat completion and return the raw response.
+
+        Args:
+            enable_thinking: Value sent as ``chat_template_kwargs.enable_thinking``.
+            stream: When true, request a streamed completion and keep the HTTP
+                body unread so the caller can iterate it.
+        """
+        payload = {
+            "model": self.model,
+            "messages": [{"role": "user", "content": "Hello"}],
+            "temperature": 0,
+            "separate_reasoning": True,
+            "chat_template_kwargs": {"enable_thinking": enable_thinking},
+        }
+        if stream:
+            payload["stream"] = True
+        # Applied last so overrides win over the defaults above.
+        payload.update(self.additional_chat_kwargs)
+        return requests.post(
+            f"{self.base_url}/v1/chat/completions",
+            headers={"Authorization": f"Bearer {self.api_key}"},
+            json=payload,
+            stream=stream,
+        )
+
+    def _assert_ok(self, response):
+        """Fail with the response body unless the status code is 200."""
+        # Read the body only on failure: touching ``.text`` on a streamed
+        # response would buffer the entire stream before it is iterated.
+        detail = "" if response.status_code == 200 else response.text
+        self.assertEqual(response.status_code, 200, f"Failed with: {detail}")
+
+    def _first_message(self, response):
+        """Validate a non-streaming response and return its first message."""
+        self._assert_ok(response)
+        data = response.json()
+        self.assertIn("choices", data)
+        self.assertTrue(len(data["choices"]) > 0)
+        self.assertIn("message", data["choices"][0])
+        return data["choices"][0]["message"]
+
+    def _scan_stream(self, response):
+        """Walk the SSE lines of ``response``.
+
+        Returns:
+            ``(has_reasoning, has_content)``: whether any delta carried a
+            non-empty ``reasoning_content`` / ``content`` field.
+        """
+        has_reasoning = False
+        has_content = False
+        for raw_line in response.iter_lines():
+            if not raw_line:
+                continue
+            line = raw_line.decode("utf-8")
+            if not line.startswith("data:"):
+                continue
+            # Strip the field name without assuming a space follows the colon.
+            payload = line[len("data:") :].strip()
+            if not payload or payload == "[DONE]":
+                continue
+            choices = json.loads(payload).get("choices")
+            if not choices:
+                continue
+            delta = choices[0].get("delta") or {}
+            if delta.get("reasoning_content"):
+                has_reasoning = True
+            if delta.get("content"):
+                has_content = True
+        return has_reasoning, has_content
+
+    def test_chat_completion_with_reasoning(self):
+        # Test non-streaming with "enable_thinking": True, reasoning_content should not be empty
+        message = self._first_message(self._post_chat(enable_thinking=True))
+
+        self.assertIn("reasoning_content", message)
+        # Truthiness rejects both null and "", matching "should not be empty".
+        self.assertTrue(
+            message["reasoning_content"],
+            "reasoning_content should not be empty when thinking is enabled",
+        )
+
+    def test_chat_completion_without_reasoning(self):
+        # Test non-streaming with "enable_thinking": False, reasoning_content should be empty
+        message = self._first_message(self._post_chat(enable_thinking=False))
+
+        # A missing key, null and "" all count as "no reasoning", in line with
+        # the truthiness check used by the streaming variant of this test.
+        self.assertFalse(
+            message.get("reasoning_content"),
+            "reasoning_content should be empty when thinking is disabled",
+        )
+
+    def test_stream_chat_completion_with_reasoning(self):
+        # Test streaming with "enable_thinking": True, reasoning_content should not be empty
+        # The context manager releases the connection once the stream is read.
+        with self._post_chat(enable_thinking=True, stream=True) as response:
+            self._assert_ok(response)
+            print("\n=== Stream With Reasoning ===")
+            has_reasoning, has_content = self._scan_stream(response)
+
+        self.assertTrue(
+            has_reasoning,
+            "The reasoning content is not included in the stream response",
+        )
+        self.assertTrue(
+            has_content, "The stream response does not contain normal content"
+        )
+
+    def test_stream_chat_completion_without_reasoning(self):
+        # Test streaming with "enable_thinking": False, reasoning_content should be empty
+        # The context manager releases the connection once the stream is read.
+        with self._post_chat(enable_thinking=False, stream=True) as response:
+            self._assert_ok(response)
+            print("\n=== Stream Without Reasoning ===")
+            has_reasoning, has_content = self._scan_stream(response)
+
+        self.assertFalse(
+            has_reasoning,
+            "The reasoning content should not be included in the stream response",
+        )
+        self.assertTrue(
+            has_content, "The stream response does not contain normal content"
+        )
+
+
+# Run the tests in this module when the file is executed directly.
+if __name__ == "__main__":
+    unittest.main()
--- a/sgl-router/py_test/e2e_grpc/features/test_reasoning_content.py
+++ b/sgl-router/py_test/e2e_grpc/features/test_reasoning_content.py
@@ -0,0 +1,198 @@
+"""
+Usage (run from sgl-router/py_test):
+python3 -m unittest e2e_grpc.features.test_reasoning_content.TestReasoningContentAPI.test_streaming_separate_reasoning_false
+python3 -m unittest e2e_grpc.features.test_reasoning_content.TestReasoningContentAPI.test_streaming_separate_reasoning_true
+python3 -m unittest e2e_grpc.features.test_reasoning_content.TestReasoningContentAPI.test_streaming_separate_reasoning_true_stream_reasoning_false
+python3 -m unittest e2e_grpc.features.test_reasoning_content.TestReasoningContentAPI.test_nonstreaming_separate_reasoning_false
+python3 -m unittest e2e_grpc.features.test_reasoning_content.TestReasoningContentAPI.test_nonstreaming_separate_reasoning_true
+
+Note: the TestReasoningContentStartup cases (test_nonstreaming, test_streaming)
+are not defined in this file.
+"""
+
+import json
+
+# CHANGE: Import router launcher instead of server launcher
+import sys
+import unittest
+from pathlib import Path
+
+import openai
+import requests
+
+# Put the parent of this directory on sys.path so the shared `fixtures` and
+# `util` modules imported below resolve as top-level names.
+_TEST_DIR = Path(__file__).parent
+sys.path.insert(0, str(_TEST_DIR.parent))
+from fixtures import popen_launch_workers_and_router
+from util import (
+    DEFAULT_REASONING_MODEL_PATH,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    kill_process_tree,
+)
+
+
+class TestReasoningContentAPI(CustomTestCase):
+    """E2E checks for ``separate_reasoning`` / ``stream_reasoning`` handling.
+
+    A router and one worker are launched once per class with the
+    ``deepseek_r1`` reasoning parser; the tests talk to it through the OpenAI
+    Python client, passing the reasoning switches via ``extra_body``.
+    """
+
+    @classmethod
+    def setUpClass(cls):
+        """Launch the router and worker shared by all tests in this class."""
+        # CHANGE: Launch gRPC router with integrated workers (single command)
+        cls.model = DEFAULT_REASONING_MODEL_PATH
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-1234"
+        cls.cluster = popen_launch_workers_and_router(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            api_key=cls.api_key,
+            router_args=[
+                "--reasoning-parser",
+                "deepseek_r1",
+            ],
+            num_workers=1,
+            tp_size=2,
+        )
+        # The launcher receives the bare URL; the OpenAI client needs "/v1".
+        cls.base_url += "/v1"
+
+    @classmethod
+    def tearDownClass(cls):
+        """Terminate the router and every worker process."""
+        try:
+            kill_process_tree(cls.cluster["router"].pid)
+        finally:
+            # Reap the workers even if stopping the router raised.
+            for worker in cls.cluster.get("workers", []):
+                kill_process_tree(worker.pid)
+
+    def _chat(self, extra_body, stream=False):
+        """Send the fixed "What is 1+3?" prompt and return the client result.
+
+        Args:
+            extra_body: Extra request fields forwarded as ``extra_body``.
+            stream: When true, request a streamed completion.
+        """
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+        payload = {
+            "model": self.model,
+            "messages": [
+                {
+                    "role": "user",
+                    "content": "What is 1+3?",
+                }
+            ],
+            "max_tokens": 100,
+            "extra_body": extra_body,
+        }
+        if stream:
+            payload["stream"] = True
+        return client.chat.completions.create(**payload)
+
+    @staticmethod
+    def _reasoning_of(obj):
+        """Return ``obj.reasoning_content``, or ``None`` if the field is absent."""
+        # The field is read defensively so a response that omits it counts as
+        # "no reasoning" instead of raising AttributeError.
+        return getattr(obj, "reasoning_content", None)
+
+    def _collect_stream(self, response):
+        """Concatenate all deltas of a streamed completion.
+
+        Returns:
+            ``(reasoning_content, content)`` as two strings.
+        """
+        reasoning_parts = []
+        content_parts = []
+        for chunk in response:
+            if not chunk.choices:
+                # Skip chunks that carry no choices rather than indexing them.
+                continue
+            delta = chunk.choices[0].delta
+            # Checked independently so a chunk carrying both fields keeps both.
+            if delta.content:
+                content_parts.append(delta.content)
+            reasoning_piece = self._reasoning_of(delta)
+            if reasoning_piece:
+                reasoning_parts.append(reasoning_piece)
+        return "".join(reasoning_parts), "".join(content_parts)
+
+    def test_streaming_separate_reasoning_false(self):
+        # Test streaming with separate_reasoning=False, reasoning_content should be empty
+        response = self._chat({"separate_reasoning": False}, stream=True)
+        reasoning_content, content = self._collect_stream(response)
+
+        self.assertEqual(
+            reasoning_content, "", "reasoning_content should be empty in the stream"
+        )
+        self.assertTrue(content, "The stream response does not contain content")
+
+    def test_streaming_separate_reasoning_true(self):
+        # Test streaming with separate_reasoning=True, reasoning_content should not be empty
+        response = self._chat({"separate_reasoning": True}, stream=True)
+        reasoning_content, content = self._collect_stream(response)
+
+        self.assertTrue(
+            reasoning_content, "reasoning_content is missing from the stream"
+        )
+        self.assertTrue(content, "The stream response does not contain content")
+
+    def test_streaming_separate_reasoning_true_stream_reasoning_false(self):
+        # Test streaming with separate_reasoning=True and stream_reasoning=False,
+        # reasoning_content should still be delivered and not be empty
+        response = self._chat(
+            {"separate_reasoning": True, "stream_reasoning": False}, stream=True
+        )
+
+        reasoning_content = ""
+        content_parts = []
+        for chunk in response:
+            if not chunk.choices:
+                continue
+            delta = chunk.choices[0].delta
+            reasoning_piece = self._reasoning_of(delta)
+            if reasoning_piece:
+                # Only the latest reasoning delta is kept (assignment, not
+                # accumulation).
+                # NOTE(review): presumably the reasoning arrives in a single
+                # delta when stream_reasoning is off; confirm before tightening
+                # this into an exact-count check.
+                reasoning_content = reasoning_piece
+            if delta.content:
+                content_parts.append(delta.content)
+
+        self.assertTrue(
+            reasoning_content, "reasoning_content is missing from the stream"
+        )
+        self.assertTrue(
+            "".join(content_parts), "The stream response does not contain content"
+        )
+
+    def test_nonstreaming_separate_reasoning_false(self):
+        # Test non-streaming with separate_reasoning=False, reasoning_content should be empty
+        response = self._chat({"separate_reasoning": False})
+        message = response.choices[0].message
+
+        self.assertFalse(
+            self._reasoning_of(message), "reasoning_content should be empty"
+        )
+        self.assertTrue(message.content, "The response does not contain content")
+
+    def test_nonstreaming_separate_reasoning_true(self):
+        # Test non-streaming with separate_reasoning=True, reasoning_content should not be empty
+        response = self._chat({"separate_reasoning": True})
+        message = response.choices[0].message
+
+        self.assertTrue(
+            self._reasoning_of(message), "reasoning_content should not be empty"
+        )
+        self.assertTrue(message.content, "The response does not contain content")
+
+
+# Run the tests in this module when the file is executed directly.
+if __name__ == "__main__":
+    unittest.main()