[NVIDIA] disable chunked prefix cache when dp and blackwell are used (#9861)
This commit is contained in:
@@ -525,6 +525,17 @@ class ModelRunner:
|
|||||||
|
|
||||||
if not self.use_mla_backend:
|
if not self.use_mla_backend:
|
||||||
server_args.disable_chunked_prefix_cache = True
|
server_args.disable_chunked_prefix_cache = True
|
||||||
|
# TODO(kaixih@nvidia): remove this once we have a better solution for DP attention.
|
||||||
|
# For more details, see: https://github.com/sgl-project/sglang/issues/8616
|
||||||
|
elif (
|
||||||
|
self.dp_size > 1
|
||||||
|
and is_sm100_supported()
|
||||||
|
and server_args.attention_backend != "triton"
|
||||||
|
):
|
||||||
|
logger.info(
|
||||||
|
"Disable chunked prefix cache when dp size > 1 and attention backend is not triton."
|
||||||
|
)
|
||||||
|
server_args.disable_chunked_prefix_cache = True
|
||||||
|
|
||||||
if not server_args.disable_chunked_prefix_cache:
|
if not server_args.disable_chunked_prefix_cache:
|
||||||
logger.info("Chunked prefix cache is turned on.")
|
logger.info("Chunked prefix cache is turned on.")
|
||||||
|
|||||||
Reference in New Issue
Block a user