Support Llama4 fp8 inference (#5194)

Co-authored-by: laixinn <xielx@shanghaitech.edu.cn>
Co-authored-by: sleepcoo <sleepcoo@gmail.com>
Co-authored-by: zhyncs <me@zhyncs.com>
Author: HandH1998
Date: 2025-04-09 20:14:34 +08:00
Committed by: GitHub
Parent: 86a876d883
Commit: 4065248214
14 changed files with 537 additions and 106 deletions

View File

@@ -55,6 +55,7 @@ from sglang.srt.layers.moe.topk import select_experts
from sglang.srt.layers.quantization.base_config import QuantizationConfig
from sglang.srt.layers.quantization.fp8_utils import (
block_quant_to_tensor_quant,
channel_quant_to_tensor_quant,
input_to_float8,
normalize_e4m3fn_to_e4m3fnuz,
)
@@ -1411,27 +1412,34 @@ class DeepseekV2ForCausalLM(nn.Module):
w = self_attn.kv_b_proj.weight
# NOTE(HandH1998): Since `bmm_fp8` only supports a per-tensor scale, we have to requantize `self_attn.kv_b_proj`.
# This may affect the accuracy of the fp8 model.
if hasattr(self.quant_config, "weight_block_size") and w.dtype in (
if w.dtype in (
torch.float8_e4m3fn,
torch.float8_e4m3fnuz,
):
weight_block_size = self.quant_config.weight_block_size
if weight_block_size is not None:
assert hasattr(self_attn.kv_b_proj, "weight_scale_inv")
if _is_hip:
weight, weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz(
weight=w,
weight_scale=self_attn.kv_b_proj.weight_scale_inv,
input_scale=None,
)
else:
weight = w
weight_scale = self_attn.kv_b_proj.weight_scale_inv
if hasattr(self.quant_config, "weight_block_size"):
weight_block_size = self.quant_config.weight_block_size
if weight_block_size is not None:
assert hasattr(self_attn.kv_b_proj, "weight_scale_inv")
if _is_hip:
weight, weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz(
weight=w,
weight_scale=self_attn.kv_b_proj.weight_scale_inv,
input_scale=None,
)
else:
weight = w
weight_scale = self_attn.kv_b_proj.weight_scale_inv
w, scale = block_quant_to_tensor_quant(
weight, weight_scale, weight_block_size
)
w, scale = block_quant_to_tensor_quant(
weight, weight_scale, weight_block_size
)
self_attn.w_scale = scale
else:
weight = w
weight_scale = self_attn.kv_b_proj.weight_scale
w, scale = channel_quant_to_tensor_quant(weight, weight_scale)
self_attn.w_scale = scale
if w.dtype == torch.int8:
if hasattr(self.quant_config, "weight_block_size"):
# block-wise int8 needs it
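
For context on the NOTE(HandH1998) above: `bmm_fp8` consumes a single per-tensor scale, so a `kv_b_proj` weight that was quantized block-wise (or, on the new path, channel-wise) has to be requantized with one scale before it reaches the attention kernels. A minimal sketch of the channel-wise case, with assumed shapes and a hypothetical helper name rather than the actual `channel_quant_to_tensor_quant` implementation:

import torch

def channel_to_tensor_quant_sketch(w_fp8: torch.Tensor, channel_scale: torch.Tensor):
    # Sketch only, not sglang code. Assumes w_fp8 is an (out, in) float8_e4m3fn
    # tensor and channel_scale is an (out, 1) float32 per-output-channel scale.
    dequant = w_fp8.to(torch.float32) * channel_scale
    fp8_max = torch.finfo(torch.float8_e4m3fn).max
    # Collapse to one scale so a per-tensor kernel such as bmm_fp8 can use it.
    tensor_scale = dequant.abs().max() / fp8_max
    requant = (dequant / tensor_scale).clamp(-fp8_max, fp8_max).to(torch.float8_e4m3fn)
    return requant, tensor_scale

The trade-off the NOTE mentions follows directly: replacing many fine-grained scales with one tensor-wide scale can lose precision, which is why the requantization may affect the accuracy of the fp8 model.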

View File

@@ -414,7 +414,7 @@ class Llama4Model(nn.Module):
lambda idx, prefix: Llama4DecoderLayer(
config=config, layer_id=idx, quant_config=quant_config, prefix=prefix
),
prefix="model.layers",
prefix=add_prefix("layers", prefix),
)
self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

View File

@@ -7,6 +7,7 @@ from torch import nn
from transformers import Llama4Config
from sglang.srt.layers.logits_processor import LogitsProcessor
from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
from sglang.srt.layers.quantization import QuantizationConfig
from sglang.srt.model_executor.forward_batch_info import ForwardBatch
from sglang.srt.model_loader.weight_utils import default_weight_loader
@@ -16,6 +17,7 @@ from sglang.srt.utils import add_prefix
class Llama4ForConditionalGeneration(nn.Module):
packed_modules_mapping = {
"qkv_proj": ["q_proj", "k_proj", "v_proj"],
"gate_up_proj": ["gate_proj", "up_proj"],
}
def __init__(
@@ -96,6 +98,15 @@ class Llama4ForConditionalGeneration(nn.Module):
num_experts = self.config.text_config.num_local_experts
# Params for weights, fp8 weight scales, fp8 activation scales
# (param_name, weight_name, expert_id, shard_id)
expert_params_mapping = FusedMoE.make_expert_params_mapping(
ckpt_gate_proj_name="gate_proj",
ckpt_down_proj_name="down_proj",
ckpt_up_proj_name="up_proj",
num_experts=num_experts,
)
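# Illustrative note (assumed tuple format, not from this diff): each entry of
# expert_params_mapping is roughly a tuple like
#     ("experts.w13_", "experts.0.gate_proj.", 0, "w1")
# i.e. (param_name prefix, checkpoint weight_name prefix, expert_id, shard_id),
# which the loop below uses to rewrite per-expert checkpoint names onto the
# fused w13_weight / w2_weight parameters.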
for name, loaded_weight in weights:
if name.startswith("vision_model") or name.startswith(
@@ -115,31 +126,54 @@ class Llama4ForConditionalGeneration(nn.Module):
break
else:
if ".experts" in name:
if ".gate_up_proj" in name:
name_list = [
name.replace(".experts.gate_up_proj", ".experts.w13_weight")
] * 2
loaded_weight_list = loaded_weight.chunk(2, dim=-1)
shard_id_list = ["w1", "w3"]
else:
name_list = [
name.replace(".experts.down_proj", ".experts.w2_weight")
]
shard_id_list = ["w2"]
loaded_weight_list = [loaded_weight]
for name, loaded_weight, shard_id in zip(
name_list, loaded_weight_list, shard_id_list
# NOTE: llama4 fp8 uses a different weight format for experts
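# Illustrative note (assumed layout, not from this diff): fp8 checkpoints keep
# per-expert tensors named like "experts.0.gate_proj.weight", matched through
# expert_params_mapping above, while the fused bf16 format stores
# "experts.gate_up_proj" with the expert dimension first and gate/up stacked on
# the last dimension, which is why the branch below chunks along dim=-1 and
# transposes loaded_weight[expert_id].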
if (
"experts.gate_up_proj" not in name
and "experts.down_proj" not in name
):
param = params_dict[name]
weight_loader = param.weight_loader
for expert_id in range(num_experts):
for mapping in expert_params_mapping:
param_name, weight_name, expert_id, shard_id = mapping
if weight_name not in name:
continue
name = name.replace(weight_name, param_name)
param = params_dict[name]
weight_loader = param.weight_loader
weight_loader(
param,
loaded_weight[expert_id].T,
loaded_weight,
name,
shard_id=shard_id,
expert_id=expert_id,
)
break
else:
if ".gate_up_proj" in name:
name_list = [
name.replace(
".experts.gate_up_proj", ".experts.w13_weight"
)
] * 2
loaded_weight_list = loaded_weight.chunk(2, dim=-1)
shard_id_list = ["w1", "w3"]
else:
name_list = [
name.replace(".experts.down_proj", ".experts.w2_weight")
]
shard_id_list = ["w2"]
loaded_weight_list = [loaded_weight]
for name, loaded_weight, shard_id in zip(
name_list, loaded_weight_list, shard_id_list
):
param = params_dict[name]
weight_loader = param.weight_loader
for expert_id in range(num_experts):
weight_loader(
param,
loaded_weight[expert_id].T,
name,
shard_id=shard_id,
expert_id=expert_id,
)
else:
# Skip loading extra bias for GPTQ models.
if name.endswith(".bias") and name not in params_dict: