Support v1/responses and use harmony in serving_chat (#8837)

Signed-off-by: Xinyuan Tong <justinning0323@outlook.com>
Signed-off-by: Xinyuan Tong <xinyuantong.cs@gmail.com>
Co-authored-by: Xinyuan Tong <justinning0323@outlook.com>
Co-authored-by: Xinyuan Tong <xinyuantong.cs@gmail.com>
This commit is contained in:
Chang Su
2025-08-06 16:20:34 -07:00
committed by GitHub
parent cbbd685a46
commit 92cc32d9fc
16 changed files with 2878 additions and 43 deletions

View File

@@ -41,6 +41,7 @@ import tempfile
import threading
import time
import traceback
import uuid
import warnings
from collections import OrderedDict, defaultdict
from contextlib import contextmanager
@@ -233,6 +234,10 @@ def is_flashinfer_available():
return importlib.util.find_spec("flashinfer") is not None and is_cuda()
def random_uuid() -> str:
    """Return a random 32-character lowercase hex string (a UUID4 without dashes).

    Used as a cheap unique identifier (e.g. request IDs).
    """
    # uuid.uuid4().hex is already a str; the previous str(...) wrapper was redundant.
    return uuid.uuid4().hex
# Whether to run under torch inference mode, read once at import time from the
# SGLANG_ENABLE_TORCH_INFERENCE_MODE env var (defaults to "false").
# NOTE(review): get_bool_env_var is defined elsewhere in this module — presumably
# it parses truthy strings like "true"/"1"; confirm its exact accepted values.
_ENABLE_TORCH_INFERENCE_MODE = get_bool_env_var(
    "SGLANG_ENABLE_TORCH_INFERENCE_MODE", "false"
)