Fix mini_lb for PD with long output: limit chunk size of decode response (#7301)

Signed-off-by: ch-tiger1 <xyz@ch-tech.ip-ddns.com>
Co-authored-by: ch-tiger1 <xyz@ch-tech.ip-ddns.com>
This commit is contained in:
ch-tiger1
2025-06-19 01:46:46 +08:00
committed by GitHub
parent 1de4db9bef
commit 2ae809c5c1

View File

@@ -18,6 +18,10 @@ from fastapi.responses import ORJSONResponse, Response, StreamingResponse
 from sglang.srt.disaggregation.utils import PDRegistryRequest
+AIOHTTP_STREAM_READ_CHUNK_SIZE = (
+    1024 * 64
+)  # 64KB, to prevent aiohttp's "Chunk too big" error
 def setup_logger():
     logger = logging.getLogger("pdlb")
@@ -154,7 +158,9 @@ class MiniLoadBalancer:
         else:
             yield chunk
     else:
-        async for chunk in decode_response.content:
+        async for chunk in decode_response.content.iter_chunked(
+            AIOHTTP_STREAM_READ_CHUNK_SIZE
+        ):
             yield chunk
 return StreamingResponse(