Misc fix for min_p_sampling, --cuda-graph-bs (#2761)

Author: Lianmin Zheng
Date: 2025-01-07 02:52:53 -08:00
Committed by: GitHub
Parent: 6d08ce2aa9
Commit: bdc1acf6cd

17 changed files with 135 additions and 63 deletions
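
Only the hunk below (a cleanup of the health-check endpoint) survived extraction; the min_p_sampling and --cuda-graph-bs fixes named in the commit title touch other files in this commit. As background on the first of those: min-p sampling keeps only the tokens whose probability is at least `min_p` times the most likely token's probability, renormalizes, and samples. A minimal reference sketch in PyTorch follows; it illustrates the technique only and is not sglang's fused kernel (the function name is hypothetical):

```python
import torch

def min_p_sample(logits: torch.Tensor, min_p: float) -> torch.Tensor:
    """Reference min-p sampling: keep tokens with prob >= min_p * max_prob.

    logits: [batch, vocab]; returns sampled token ids of shape [batch].
    (Illustrative sketch, not sglang's actual implementation.)
    """
    probs = torch.softmax(logits, dim=-1)
    max_probs = probs.max(dim=-1, keepdim=True).values
    # Zero out tokens below the scaled threshold; the argmax token always survives.
    probs = torch.where(probs >= min_p * max_probs, probs, torch.zeros_like(probs))
    # Renormalize the surviving mass and sample one token per row.
    probs = probs / probs.sum(dim=-1, keepdim=True)
    return torch.multinomial(probs, num_samples=1).squeeze(-1)

# Example: sample from a toy 2x5 logit matrix with min_p = 0.1.
tokens = min_p_sample(torch.randn(2, 5), min_p=0.1)
```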

```diff
@@ -127,14 +127,12 @@ async def health() -> Response:
 async def health_generate(request: Request) -> Response:
     """Check the health of the inference server by generating one token."""
+    sampling_params = {"max_new_tokens": 1, "temperature": 0.7}
     if tokenizer_manager.is_generation:
-        gri = GenerateReqInput(
-            input_ids=[0], sampling_params={"max_new_tokens": 1, "temperature": 0.7}
-        )
+        gri = GenerateReqInput(input_ids=[0], sampling_params=sampling_params)
     else:
-        gri = EmbeddingReqInput(
-            input_ids=[0], sampling_params={"max_new_tokens": 1, "temperature": 0.7}
-        )
+        gri = EmbeddingReqInput(input_ids=[0], sampling_params=sampling_params)
     try:
         async for _ in tokenizer_manager.generate_request(gri, request):
```
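
The change itself is a small deduplication: the sampling_params literal, previously repeated in both branches, is hoisted above the if/else so the two request constructors stay in sync. A self-contained sketch of the same pattern, where the dataclasses are simplified stand-ins for sglang's real GenerateReqInput/EmbeddingReqInput:

```python
from dataclasses import dataclass
from typing import Any

# Simplified stand-ins for sglang's real request types (illustration only).
@dataclass
class GenerateReqInput:
    input_ids: list[int]
    sampling_params: dict[str, Any]

@dataclass
class EmbeddingReqInput:
    input_ids: list[int]
    sampling_params: dict[str, Any]

def build_health_request(is_generation: bool):
    # The refactor: define the shared literal once instead of in each branch.
    sampling_params = {"max_new_tokens": 1, "temperature": 0.7}
    if is_generation:
        return GenerateReqInput(input_ids=[0], sampling_params=sampling_params)
    return EmbeddingReqInput(input_ids=[0], sampling_params=sampling_params)

print(build_health_request(True))
```

In the server, this path runs when a client probes the health-check route, which issues a single one-token generation to verify the engine end to end.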