From e5db40dcbce67157e005f524bf6a5bea7dcb7f34 Mon Sep 17 00:00:00 2001
From: Michael Feil <63565275+michaelfeil@users.noreply.github.com>
Date: Thu, 17 Oct 2024 08:03:08 -0700
Subject: [PATCH] ORJson. Faster Json serialization (#1694)

---
 python/sglang/srt/server.py | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py
index dea4a7d04..644cb2b8a 100644
--- a/python/sglang/srt/server.py
+++ b/python/sglang/srt/server.py
@@ -28,7 +28,9 @@ import os
 import threading
 import time
 from http import HTTPStatus
-from typing import Dict, List, Optional, Union
+from typing import AsyncIterator, Dict, List, Optional, Union
+
+import orjson
 
 # Fix a bug of Python threading
 setattr(threading, "_register_atexit", lambda *args, **kwargs: None)
@@ -192,14 +194,18 @@ async def generate_request(obj: GenerateReqInput, request: Request):
     """Handle a generate request."""
     if obj.stream:
 
-        async def stream_results():
+        async def stream_results() -> AsyncIterator[bytes]:
             try:
                 async for out in tokenizer_manager.generate_request(obj, request):
-                    yield f"data: {json.dumps(out, ensure_ascii=False)}\n\n"
+                    yield b"data: " + orjson.dumps(
+                        out, option=orjson.OPT_NON_STR_KEYS
+                    ) + b"\n\n"
             except ValueError as e:
                 out = {"error": {"message": str(e)}}
-                yield f"data: {json.dumps(out, ensure_ascii=False)}\n\n"
-            yield "data: [DONE]\n\n"
+                yield b"data: " + orjson.dumps(
+                    out, option=orjson.OPT_NON_STR_KEYS
+                ) + b"\n\n"
+            yield b"data: [DONE]\n\n"
 
         return StreamingResponse(
             stream_results(),
@@ -260,13 +266,13 @@ async def openai_v1_chat_completions(raw_request: Request):
     return await v1_chat_completions(tokenizer_manager, raw_request)
 
 
-@app.post("/v1/embeddings")
+@app.post("/v1/embeddings", response_class=ORJSONResponse)
 async def openai_v1_embeddings(raw_request: Request):
     response = await v1_embeddings(tokenizer_manager, raw_request)
     return response
 
 
-@app.get("/v1/models")
+@app.get("/v1/models", response_class=ORJSONResponse)
 def available_models():
     """Show available models."""
     served_model_names = [tokenizer_manager.served_model_name]