[gpt-oss] Add gpt-oss bf16 support

2025-08-13 21:25:57 +08:00
parent 5d2e7edf78
commit 8ba49a7723
9 changed files with 777 additions and 36 deletions
--- a/model_executor/layers/fused_moe/layer.py
+++ b/model_executor/layers/fused_moe/layer.py
@@ -226,6 +226,8 @@ class MoEConfig:

    max_num_tokens: int = MOE_DP_CHUNK_SIZE

+    has_bias: bool = False
+
    @property
    def tp_size(self):
        return self.moe_parallel_config.tp_size
@@ -443,6 +445,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
        self.fused_experts = fused_experts  # type: ignore
        self.topk_indices_dtype = None
        self.moe = moe
+        self.has_bias = self.moe.has_bias

        self.rocm_aiter_moe_enabled = is_rocm_aiter_moe_enabled()
        if self.rocm_aiter_moe_enabled:
@@ -502,6 +505,14 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
                                        requires_grad=False)
        layer.register_parameter("w13_weight", w13_weight)
        set_weight_attrs(w13_weight, extra_weight_attrs)
+        if self.has_bias:
+            w13_bias = torch.nn.Parameter(torch.zeros(
+                num_experts,
+                2 * intermediate_size_per_partition,
+                dtype=params_dtype),
+                                          requires_grad=False)
+            layer.register_parameter("w13_bias", w13_bias)
+            set_weight_attrs(w13_bias, extra_weight_attrs)

        # down_proj (row parallel)
        w2_weight = torch.nn.Parameter(torch.empty(
@@ -512,6 +523,13 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
                                       requires_grad=False)
        layer.register_parameter("w2_weight", w2_weight)
        set_weight_attrs(w2_weight, extra_weight_attrs)
+        if self.has_bias:
+            w2_bias = torch.nn.Parameter(torch.zeros(num_experts,
+                                                     hidden_size,
+                                                     dtype=params_dtype),
+                                         requires_grad=False)
+            layer.register_parameter("w2_bias", w2_bias)
+            set_weight_attrs(w2_bias, extra_weight_attrs)

    def _maybe_pad_weight(self, weight: torch.Tensor) -> torch.Tensor:
        # Pad the weight tensor. This is an optimization on ROCm platform, which
@@ -634,6 +652,8 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
                hidden_states=x,
                w1=layer.w13_weight,
                w2=layer.w2_weight,
+                w1_bias=layer.w13_bias if self.has_bias else None,
+                w2_bias=layer.w2_bias if self.has_bias else None,
                topk_weights=topk_weights,
                topk_ids=topk_ids,
                inplace=True,
@@ -840,6 +860,7 @@ class FusedMoE(torch.nn.Module):
        e_score_correction_bias: Optional[torch.Tensor] = None,
        apply_router_weight_on_input: bool = False,
        activation: str = "silu",
+        has_bias: bool = False,
    ):
        super().__init__()
        if params_dtype is None:
@@ -920,6 +941,7 @@ class FusedMoE(torch.nn.Module):
            in_dtype=params_dtype,
            quant_dtype=quant_dtype,
            max_num_tokens=MOE_DP_CHUNK_SIZE,
+            has_bias=has_bias,
        )
        self.moe_config = moe
        self.quant_config = quant_config