Misc fix for min_p_sampling, --cuda-graph-bs (#2761)

Author: Lianmin Zheng
Date: 2025-01-07 02:52:53 -08:00
Committed by: GitHub
Parent: 6d08ce2aa9
Commit: bdc1acf6cd

17 changed files with 135 additions and 63 deletions
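
Only the hunk below (a cleanup of the health-check endpoint) survived extraction; the min_p_sampling and --cuda-graph-bs fixes named in the commit title touch other files in this commit. As background on the first of those: min-p sampling keeps only the tokens whose probability is at least `min_p` times the most likely token's probability, renormalizes, and samples. A minimal reference sketch in PyTorch follows; it illustrates the technique only and is not sglang's fused kernel (the function name is hypothetical):

```python
import torch

def min_p_sample(logits: torch.Tensor, min_p: float) -> torch.Tensor:
    """Reference min-p sampling: keep tokens with prob >= min_p * max_prob.

    logits: [batch, vocab]; returns sampled token ids of shape [batch].
    (Illustrative sketch, not sglang's actual implementation.)
    """
    probs = torch.softmax(logits, dim=-1)
    max_probs = probs.max(dim=-1, keepdim=True).values
    # Zero out tokens below the scaled threshold; the argmax token always survives.
    probs = torch.where(probs >= min_p * max_probs, probs, torch.zeros_like(probs))
    # Renormalize the surviving mass and sample one token per row.
    probs = probs / probs.sum(dim=-1, keepdim=True)
    return torch.multinomial(probs, num_samples=1).squeeze(-1)

# Example: sample from a toy 2x5 logit matrix with min_p = 0.1.
tokens = min_p_sample(torch.randn(2, 5), min_p=0.1)
```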

```diff
@@ -127,14 +127,12 @@ async def health() -> Response:
 async def health_generate(request: Request) -> Response:
     """Check the health of the inference server by generating one token."""
+    sampling_params = {"max_new_tokens": 1, "temperature": 0.7}
     if tokenizer_manager.is_generation:
-        gri = GenerateReqInput(
-            input_ids=[0], sampling_params={"max_new_tokens": 1, "temperature": 0.7}
-        )
+        gri = GenerateReqInput(input_ids=[0], sampling_params=sampling_params)
     else:
-        gri = EmbeddingReqInput(
-            input_ids=[0], sampling_params={"max_new_tokens": 1, "temperature": 0.7}
-        )
+        gri = EmbeddingReqInput(input_ids=[0], sampling_params=sampling_params)
     try:
         async for _ in tokenizer_manager.generate_request(gri, request):
```
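
The change itself is a small deduplication: the sampling_params literal, previously repeated in both branches, is hoisted above the if/else so the two request constructors stay in sync. A self-contained sketch of the same pattern, where the dataclasses are simplified stand-ins for sglang's real GenerateReqInput/EmbeddingReqInput:

```python
from dataclasses import dataclass
from typing import Any

# Simplified stand-ins for sglang's real request types (illustration only).
@dataclass
class GenerateReqInput:
    input_ids: list[int]
    sampling_params: dict[str, Any]

@dataclass
class EmbeddingReqInput:
    input_ids: list[int]
    sampling_params: dict[str, Any]

def build_health_request(is_generation: bool):
    # The refactor: define the shared literal once instead of in each branch.
    sampling_params = {"max_new_tokens": 1, "temperature": 0.7}
    if is_generation:
        return GenerateReqInput(input_ids=[0], sampling_params=sampling_params)
    return EmbeddingReqInput(input_ids=[0], sampling_params=sampling_params)

print(build_health_request(True))
```

In the server, this path runs when a client probes the health-check route, which issues a single one-token generation to verify the engine end to end.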