Fix mini_lb for prefill/decode (PD) disaggregation with long outputs: limit the read chunk size of the decode response (#7301)
Signed-off-by: ch-tiger1 <xyz@ch-tech.ip-ddns.com>
Co-authored-by: ch-tiger1 <xyz@ch-tech.ip-ddns.com>
This commit is contained in:
@@ -18,6 +18,10 @@ from fastapi.responses import ORJSONResponse, Response, StreamingResponse
|
||||
|
||||
from sglang.srt.disaggregation.utils import PDRegistryRequest
|
||||
|
||||
AIOHTTP_STREAM_READ_CHUNK_SIZE = (
|
||||
1024 * 64
|
||||
) # 64KB, to prevent aiohttp's "Chunk too big" error
|
||||
|
||||
|
||||
def setup_logger():
|
||||
logger = logging.getLogger("pdlb")
|
||||
@@ -154,7 +158,9 @@ class MiniLoadBalancer:
|
||||
else:
|
||||
yield chunk
|
||||
else:
|
||||
async for chunk in decode_response.content:
|
||||
async for chunk in decode_response.content.iter_chunked(
|
||||
AIOHTTP_STREAM_READ_CHUNK_SIZE
|
||||
):
|
||||
yield chunk
|
||||
|
||||
return StreamingResponse(
|
||||
|
||||
Reference in New Issue
Block a user