From 2ae809c5c1cc53aee5ca7b5f6e20e234bcd23481 Mon Sep 17 00:00:00 2001
From: ch-tiger1
Date: Thu, 19 Jun 2025 01:46:46 +0800
Subject: [PATCH] Fix mini_lb for PD with long output: limit chunk size of decode response (#7301)

Signed-off-by: ch-tiger1
Co-authored-by: ch-tiger1
---
 python/sglang/srt/disaggregation/mini_lb.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/python/sglang/srt/disaggregation/mini_lb.py b/python/sglang/srt/disaggregation/mini_lb.py
index c7e0a2089..8e3371c73 100644
--- a/python/sglang/srt/disaggregation/mini_lb.py
+++ b/python/sglang/srt/disaggregation/mini_lb.py
@@ -18,6 +18,10 @@ from fastapi.responses import ORJSONResponse, Response, StreamingResponse
 
 from sglang.srt.disaggregation.utils import PDRegistryRequest
 
+AIOHTTP_STREAM_READ_CHUNK_SIZE = (
+    1024 * 64
+)  # 64KB, to prevent aiohttp's "Chunk too big" error
+
 
 def setup_logger():
     logger = logging.getLogger("pdlb")
@@ -154,7 +158,9 @@ class MiniLoadBalancer:
                             else:
                                 yield chunk
                     else:
-                        async for chunk in decode_response.content:
+                        async for chunk in decode_response.content.iter_chunked(
+                            AIOHTTP_STREAM_READ_CHUNK_SIZE
+                        ):
                             yield chunk
 
         return StreamingResponse(