Support DP attention with GPT-OSS (#9359)

This commit is contained in:
Nicolas Castet
2025-08-20 18:36:31 -05:00
committed by GitHub
parent d4bce29721
commit c10b8e6a0f
2 changed files with 6 additions and 5 deletions

View File

@@ -1091,7 +1091,7 @@ class GptOssForCausalLM(nn.Module):
if name in params_dict.keys():
param = params_dict[name]
if "sinks" in name:
- start = tp_rank * param.numel()
+ start = get_attention_tp_rank() * param.numel()
param.data.copy_(
loaded_weight[start : start + param.numel()]
)

View File

@@ -2183,10 +2183,11 @@ class ServerArgs:
), f"GptOssForCausalLM requires one of {supported_backends} attention backend, but got '{self.attention_backend}'"
if is_sm100_supported():
- self.enable_flashinfer_allreduce_fusion = True
- logger.info(
- "Enable FlashInfer AllReduce Fusion on sm100 for GptOssForCausalLM"
- )
+ if not self.enable_dp_attention:
+ self.enable_flashinfer_allreduce_fusion = True
+ logger.info(
+ "Enable FlashInfer AllReduce Fusion on sm100 for GptOssForCausalLM"
+ )
quantization_config = getattr(hf_config, "quantization_config", None)
is_mxfp4_quant_format = (
quantization_config is not None