From c10b8e6a0f2a32ca156c09ecce795454336c4774 Mon Sep 17 00:00:00 2001
From: Nicolas Castet <26874160+nvcastet@users.noreply.github.com>
Date: Wed, 20 Aug 2025 18:36:31 -0500
Subject: [PATCH] Support DP attention with GPT-OSS (#9359)

---
 python/sglang/srt/models/gpt_oss.py | 2 +-
 python/sglang/srt/server_args.py    | 9 +++++----
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/python/sglang/srt/models/gpt_oss.py b/python/sglang/srt/models/gpt_oss.py
index ff34f1eea..f3734d735 100644
--- a/python/sglang/srt/models/gpt_oss.py
+++ b/python/sglang/srt/models/gpt_oss.py
@@ -1091,7 +1091,7 @@ class GptOssForCausalLM(nn.Module):
             if name in params_dict.keys():
                 param = params_dict[name]
                 if "sinks" in name:
-                    start = tp_rank * param.numel()
+                    start = get_attention_tp_rank() * param.numel()
                     param.data.copy_(
                         loaded_weight[start : start + param.numel()]
                     )
diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index 36606e97a..326b67e37 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -2183,10 +2183,11 @@ class ServerArgs:
             ), f"GptOssForCausalLM requires one of {supported_backends} attention backend, but got '{self.attention_backend}'"
 
         if is_sm100_supported():
-            self.enable_flashinfer_allreduce_fusion = True
-            logger.info(
-                "Enable FlashInfer AllReduce Fusion on sm100 for GptOssForCausalLM"
-            )
+            if not self.enable_dp_attention:
+                self.enable_flashinfer_allreduce_fusion = True
+                logger.info(
+                    "Enable FlashInfer AllReduce Fusion on sm100 for GptOssForCausalLM"
+                )
         quantization_config = getattr(hf_config, "quantization_config", None)
         is_mxfp4_quant_format = (
            quantization_config is not None
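
Note for readers of this patch: below is a minimal, self-contained sketch of why the "sinks" offset must use the attention-TP rank once DP attention is enabled. With DP attention, the global tensor-parallel group is partitioned into smaller attention-TP subgroups, so attention tensors are sharded by the attention-TP rank, not the global TP rank. The shapes, group sizes, and the sink_slice helper here are illustrative assumptions, not code from sglang. The server_args.py hunk is a separate guard: it stops the sm100 FlashInfer AllReduce fusion from being enabled by default when DP attention is on; a plausible reading is that the fused allreduce path assumes the plain-TP communication pattern, which DP attention changes.

# Illustrative sketch (not part of the patch): indexing per-rank shards of a
# flat per-head tensor such as "sinks". All sizes below are made-up examples.

num_heads = 64                       # assumed total attention heads in the checkpoint
tp_size = 8                          # assumed global tensor-parallel world size
dp_size = 4                          # assumed DP-attention replication factor
attn_tp_size = tp_size // dp_size    # attention-TP group size: 2

heads_per_rank = num_heads // attn_tp_size  # each attention-TP rank holds 32 heads

def sink_slice(attn_tp_rank: int) -> tuple[int, int]:
    """Return the [start, end) window into the flat sink tensor for one rank."""
    start = attn_tp_rank * heads_per_rank
    return start, start + heads_per_rank

# Before the fix, indexing with the *global* tp_rank (0..7) would read past the
# 64-entry tensor for ranks >= attn_tp_size. With the attention-TP rank (0..1
# within each attention-TP group), every DP replica loads a valid shard.
for rank in range(attn_tp_size):
    print(rank, sink_slice(rank))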