Support casting bf16 NextN moe to fp8 (#11613)

2025-10-18 08:02:15 +08:00
parent 505329cab0
commit 8af8491298
2 changed files with 93 additions and 3 deletions
--- a/python/sglang/srt/models/deepseek_nextn.py
+++ b/python/sglang/srt/models/deepseek_nextn.py
@@ -25,13 +25,18 @@ from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_r
 from sglang.srt.layers.dp_attention import is_dp_attention_enabled
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization import Fp8Config
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.vocab_parallel_embedding import (
    ParallelLMHead,
    VocabParallelEmbedding,
 )
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
-from sglang.srt.models.deepseek_v2 import DeepseekV2DecoderLayer, DeepseekV3ForCausalLM
+from sglang.srt.models.deepseek_v2 import (
    DeepseekV2DecoderLayer,
    DeepseekV3ForCausalLM,
    enable_nextn_moe_bf16_cast_to_fp8,
 )
 from sglang.srt.server_args import get_global_server_args
 from sglang.srt.utils import BumpAllocator, add_prefix, is_cuda
@@ -49,6 +54,16 @@ class DeepseekModelNextN(nn.Module):
        prefix: str = "",
    ) -> None:
        super().__init__()
        if enable_nextn_moe_bf16_cast_to_fp8(quant_config):
            # refer to real DeepSeek V3 quant config
            moe_quant_config = Fp8Config(
                is_checkpoint_fp8_serialized=True,
                weight_block_size=[128, 128],
            )
        else:
            moe_quant_config = None
        if quant_config is not None and quant_config.get_name() == "modelopt_fp4":
            logger.warning(
                "Overriding DeepseekV3ForCausalLMNextN quant config for modelopt_fp4 Deepseek model."
@@ -74,6 +89,7 @@ class DeepseekModelNextN(nn.Module):
            config,
            0,
            quant_config=quant_config,
            moe_quant_config=moe_quant_config,
            is_nextn=True,
            prefix=add_prefix("decoder", prefix),
            alt_stream=self.alt_stream,
--- a/python/sglang/srt/models/deepseek_v2.py
+++ b/python/sglang/srt/models/deepseek_v2.py
@@ -26,6 +26,7 @@ from typing import Any, Dict, Iterable, Optional, Tuple, Union
 import torch
 import torch.nn.functional as F
 from torch import nn
 from tqdm import tqdm, trange
 from transformers import PretrainedConfig
 from sglang.srt import single_batch_overlap
@@ -82,7 +83,7 @@ from sglang.srt.layers.moe import (
 from sglang.srt.layers.moe.ep_moe.layer import DeepEPMoE, get_moe_impl_class
 from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
 from sglang.srt.layers.moe.topk import TopK, TopKOutputFormat
-from sglang.srt.layers.quantization import deep_gemm_wrapper
+from sglang.srt.layers.quantization import Fp8Config, deep_gemm_wrapper
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.quantization.fp8_kernel import (
    is_fp8_fnuz,
@@ -196,6 +197,15 @@ _is_cublas_ge_129 = is_nvidia_cublas_cu12_version_ge_12_9()
 logger = logging.getLogger(__name__)
 def enable_nextn_moe_bf16_cast_to_fp8(quant_config):
    return (
        quant_config is not None
        and quant_config.get_name() == "modelopt_fp4"
        and get_moe_a2a_backend().is_deepep()
    )
 FORWARD_ABSORB_CORE_ATTENTION_BACKENDS = [
    "fa3",
    "nsa",
@@ -526,6 +536,7 @@ class DeepseekV2MoE(nn.Module):
        self.config = config
        self.layer_id = layer_id
        self.alt_stream = alt_stream
        self.is_nextn = is_nextn
        if self.tp_size > config.n_routed_experts:
            raise ValueError(
@@ -2381,6 +2392,7 @@ class DeepseekV2DecoderLayer(nn.Module):
        config: PretrainedConfig,
        layer_id: int,
        quant_config: Optional[QuantizationConfig] = None,
        moe_quant_config: Optional[QuantizationConfig] = None,
        is_nextn: bool = False,
        prefix: str = "",
        alt_stream: Optional[torch.cuda.Stream] = None,
@@ -2430,7 +2442,7 @@ class DeepseekV2DecoderLayer(nn.Module):
        if self.is_layer_sparse:
            self.mlp = DeepseekV2MoE(
                config=config,
-                quant_config=quant_config,
+                quant_config=moe_quant_config or quant_config,
                prefix=add_prefix("mlp", prefix),
                layer_id=self.layer_id,
                alt_stream=alt_stream,
@@ -3109,6 +3121,9 @@ class DeepseekV2ForCausalLM(nn.Module):
        ):
            self._weight_requant_ue8m0(is_nextn)
        if is_nextn and enable_nextn_moe_bf16_cast_to_fp8(self.quant_config):
            self._transform_scale_nextn_moe_ue8m0()
    def _weight_requant_ue8m0(self, is_nextn=False):
        weight_block_size = self.quant_config.weight_block_size
@@ -3174,6 +3189,28 @@ class DeepseekV2ForCausalLM(nn.Module):
                        module.weight, module.weight_scale_inv, weight_block_size
                    )
    # TODO avoid code dup (currently combine from weight_requant_ue8m0 and transform_scale_ue8m0)
    def _transform_scale_nextn_moe_ue8m0(self):
        layer = self.model.decoder
        shared_experts = getattr(layer.mlp, "shared_experts", None)
        if shared_experts is not None:
            for module in [
                shared_experts.gate_up_proj,
                shared_experts.down_proj,
            ]:
                transform_scale_ue8m0_inplace(
                    module.weight_scale_inv, mn=module.weight.shape[-2]
                )
        experts = layer.mlp.experts
        if isinstance(experts, DeepEPMoE):
            for w in [
                experts.w13_weight_fp8,
                experts.w2_weight_fp8,
            ]:
                transform_scale_ue8m0_inplace(w[1], mn=w[0].shape[-2])
    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]], is_nextn=False):
        if is_nextn:
@@ -3189,6 +3226,11 @@ class DeepseekV2ForCausalLM(nn.Module):
            else:
                raise ValueError("num_nextn_predict_layers is not in the config")
        if is_nextn and enable_nextn_moe_bf16_cast_to_fp8(self.quant_config):
            weights = self._quant_nextn_moe_to_fp8_ue8m0(
                weights, nextn_layer_id=nextn_layer_id
            )
        stacked_params_mapping = [
            # (param_name, shard_name, shard_id)
            ("gate_up_proj", "gate_proj", 0),
@@ -3418,6 +3460,38 @@ class DeepseekV2ForCausalLM(nn.Module):
        self.post_load_weights(is_nextn=is_nextn, weight_names=weight_names)
    # TODO avoid code dup
    def _quant_nextn_moe_to_fp8_ue8m0(self, weights, nextn_layer_id: int):
        weights_dict = dict(weights)
        # temporarily only support DeepSeek V3/R1
        weight_block_size = [128, 128]
        for layer_id in [nextn_layer_id]:
            for expert_sub_name in [
                "shared_experts",
                *[
                    f"experts.{expert_id}"
                    for expert_id in range(self.config.n_routed_experts)
                ],
            ]:
                for stem in [
                    "gate_proj",
                    "up_proj",
                    "down_proj",
                ]:
                    partial_name = (
                        f"model.layers.{layer_id}.mlp.{expert_sub_name}.{stem}"
                    )
                    original_weight = weights_dict[f"{partial_name}.weight"]
                    out_w, out_s = quant_weight_ue8m0(
                        original_weight, weight_block_size=weight_block_size
                    )
                    weights_dict[f"{partial_name}.weight"] = out_w
                    weights_dict[f"{partial_name}.weight_scale_inv"] = out_s
        return list(weights_dict.items())
    def get_embed_and_head(self):
        return self.model.embed_tokens.weight, self.lm_head.weight