Fix mini_lb for PD with long output: limit chunk size of decode response (#7301)

Signed-off-by: ch-tiger1 <xyz@ch-tech.ip-ddns.com>
Co-authored-by: ch-tiger1 <xyz@ch-tech.ip-ddns.com>
This commit is contained in:
ch-tiger1
2025-06-19 01:46:46 +08:00
committed by GitHub
parent 1de4db9bef
commit 2ae809c5c1

View File

@@ -18,6 +18,10 @@ from fastapi.responses import ORJSONResponse, Response, StreamingResponse
from sglang.srt.disaggregation.utils import PDRegistryRequest
# Maximum number of bytes requested per read when iterating an aiohttp
# response stream in fixed-size chunks (64KB). Bounding each read is what
# prevents aiohttp's "Chunk too big" error.
AIOHTTP_STREAM_READ_CHUNK_SIZE = 64 * 1024
def setup_logger():
logger = logging.getLogger("pdlb")
@@ -154,7 +158,9 @@ class MiniLoadBalancer:
else:
yield chunk
else:
async for chunk in decode_response.content:
async for chunk in decode_response.content.iter_chunked(
AIOHTTP_STREAM_READ_CHUNK_SIZE
):
yield chunk
return StreamingResponse(