Improve structured outputs: fix race condition, server crash, metrics and style (#6188)

This commit is contained in:
Lianmin Zheng
2025-05-11 08:36:16 -07:00
committed by GitHub
parent 94d42b6794
commit 01bdbf7f80
13 changed files with 568 additions and 258 deletions

View File

@@ -82,7 +82,7 @@ class TestJSONConstrainedOutlinesBackend(CustomTestCase):
print(json.dumps(ret))
print("=" * 100)
-if not json_schema:
+if not json_schema or json_schema == "INVALID":
return
# Make sure the json output is valid
@@ -97,6 +97,9 @@ class TestJSONConstrainedOutlinesBackend(CustomTestCase):
def test_json_generate(self):
self.run_decode(json_schema=self.json_schema)
+def test_json_invalid(self):
+    self.run_decode(json_schema="INVALID")
def test_json_openai(self):
client = openai.Client(api_key="EMPTY", base_url=f"{self.base_url}/v1")
@@ -104,7 +107,10 @@ class TestJSONConstrainedOutlinesBackend(CustomTestCase):
model=self.model,
messages=[
{"role": "system", "content": "You are a helpful AI assistant"},
-{"role": "user", "content": "Introduce the capital of France."},
+{
+    "role": "user",
+    "content": "Introduce the capital of France. Return in a JSON format.",
+},
],
temperature=0,
max_tokens=128,

View File

@@ -56,6 +56,7 @@ class TestEnableMetrics(CustomTestCase):
"sglang:token_usage",
"sglang:gen_throughput",
"sglang:num_queue_reqs",
+"sglang:num_grammar_queue_reqs",
"sglang:cache_hit_rate",
"sglang:spec_accept_length",
"sglang:prompt_tokens_total",