From 2ae809c5c1cc53aee5ca7b5f6e20e234bcd23481 Mon Sep 17 00:00:00 2001
From: ch-tiger1 <tiger@ch-tech.ip-ddns.com>
Date: Thu, 19 Jun 2025 01:46:46 +0800
Subject: [PATCH] Fix mini_lb for PD with long output: limit chunk size of
 decode response (#7301)

Signed-off-by: ch-tiger1 <xyz@ch-tech.ip-ddns.com>
Co-authored-by: ch-tiger1 <xyz@ch-tech.ip-ddns.com>
---
 python/sglang/srt/disaggregation/mini_lb.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/python/sglang/srt/disaggregation/mini_lb.py b/python/sglang/srt/disaggregation/mini_lb.py
index c7e0a2089..8e3371c73 100644
--- a/python/sglang/srt/disaggregation/mini_lb.py
+++ b/python/sglang/srt/disaggregation/mini_lb.py
@@ -18,6 +18,10 @@ from fastapi.responses import ORJSONResponse, Response, StreamingResponse
 
 from sglang.srt.disaggregation.utils import PDRegistryRequest
 
+AIOHTTP_STREAM_READ_CHUNK_SIZE = (
+    1024 * 64
+)  # 64KB, to prevent aiohttp's "Chunk too big" error
+
 
 def setup_logger():
     logger = logging.getLogger("pdlb")
@@ -154,7 +158,9 @@ class MiniLoadBalancer:
                         else:
                             yield chunk
                 else:
-                    async for chunk in decode_response.content:
+                    async for chunk in decode_response.content.iter_chunked(
+                        AIOHTTP_STREAM_READ_CHUNK_SIZE
+                    ):
                         yield chunk
 
         return StreamingResponse(