Support penalty in overlap mode; return logprob with chunked prefill; improve benchmark scripts (#3988)

Co-authored-by: SangBin Cho <rkooo567@gmail.com> Co-authored-by: dhou-xai <dhou@x.ai> Co-authored-by: Hanming Lu <hanming_lu@berkeley.edu>
2025-03-03 00:12:04 -08:00
parent 0194948fd9
commit ac2387279e
86 changed files with 4116 additions and 2015 deletions
--- a/test/srt/test_health_check.py
+++ b/test/srt/test_health_check.py
@@ -0,0 +1,27 @@
+import unittest
+
+from sglang.test.test_utils import (
+    DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
+    DEFAULT_URL_FOR_TEST,
+    popen_launch_server,
+)
+
+
+class TestHealthCheck(unittest.TestCase):
+    def test_health_check(self):
+        """Test that launching the server with an overridden health-test architecture raises TimeoutError"""
+        with self.assertRaises(TimeoutError):
+            popen_launch_server(
+                DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
+                DEFAULT_URL_FOR_TEST,
+                timeout=60,
+                other_args=[
+                    "--disable-cuda-graph",
+                    "--json-model-override-args",
+                    '{"architectures": ["LlamaForCausalLMForHealthTest"]}',
+                ],
+            )
+
+
+if __name__ == "__main__":  # only when this file is executed as a script
+    unittest.main()