Misc fix for min_p_sampling, --cuda-graph-bs (#2761)
This commit is contained in:
@@ -127,14 +127,12 @@ async def health() -> Response:
|
||||
async def health_generate(request: Request) -> Response:
|
||||
"""Check the health of the inference server by generating one token."""
|
||||
|
||||
+ sampling_params = {"max_new_tokens": 1, "temperature": 0.7}
|
||||
|
||||
if tokenizer_manager.is_generation:
|
||||
- gri = GenerateReqInput(
|
||||
- input_ids=[0], sampling_params={"max_new_tokens": 1, "temperature": 0.7}
|
||||
- )
|
||||
+ gri = GenerateReqInput(input_ids=[0], sampling_params=sampling_params)
|
||||
else:
|
||||
- gri = EmbeddingReqInput(
|
||||
- input_ids=[0], sampling_params={"max_new_tokens": 1, "temperature": 0.7}
|
||||
- )
|
||||
+ gri = EmbeddingReqInput(input_ids=[0], sampling_params=sampling_params)
|
||||
|
||||
try:
|
||||
async for _ in tokenizer_manager.generate_request(gri, request):
|
||||
|
||||
Reference in New Issue
Block a user