[NVIDIA] disable chunked prefix cache when dp and blackwell is used (#9861)

2025-09-05 23:05:16 -07:00
parent 9a719b7afc
commit 90dfe3de4c
1 changed files with 11 additions and 0 deletions
--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -525,6 +525,17 @@ class ModelRunner:
        if not self.use_mla_backend:
            server_args.disable_chunked_prefix_cache = True
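        # TODO(kaixih@nvidia): remove this once we have a better solution for DP attention.
        # Until then, chunked prefix cache is force-disabled when dp_size > 1,
        # is_sm100_supported() is true, and the attention backend is not "triton".
        # For more details, see: https://github.com/sgl-project/sglang/issues/8616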
        elif (
            self.dp_size > 1
            and is_sm100_supported()
            and server_args.attention_backend != "triton"
        ):
            logger.info(
                "Disable chunked prefix cache when dp size > 1 and attention backend is not triton."
            )
            server_args.disable_chunked_prefix_cache = True
        if not server_args.disable_chunked_prefix_cache:
            logger.info("Chunked prefix cache is turned on.")