Fix mini_lb for PD with long output: limit chunk size of decode response (#7301)
Signed-off-by: ch-tiger1 <xyz@ch-tech.ip-ddns.com>
Co-authored-by: ch-tiger1 <xyz@ch-tech.ip-ddns.com>
This commit is contained in:
@@ -18,6 +18,10 @@ from fastapi.responses import ORJSONResponse, Response, StreamingResponse
 
 from sglang.srt.disaggregation.utils import PDRegistryRequest
 
+AIOHTTP_STREAM_READ_CHUNK_SIZE = (
+    1024 * 64
+)  # 64KB, to prevent aiohttp's "Chunk too big" error
+
 
 def setup_logger():
     logger = logging.getLogger("pdlb")
@@ -154,7 +158,9 @@ class MiniLoadBalancer:
                             else:
                                 yield chunk
                     else:
-                        async for chunk in decode_response.content:
+                        async for chunk in decode_response.content.iter_chunked(
+                            AIOHTTP_STREAM_READ_CHUNK_SIZE
+                        ):
                             yield chunk
 
         return StreamingResponse(
Reference in New Issue
Block a user