Support Llama4 fp8 inference (#5194)
Co-authored-by: laixinn <xielx@shanghaitech.edu.cn>
Co-authored-by: sleepcoo <sleepcoo@gmail.com>
Co-authored-by: zhyncs <me@zhyncs.com>
@@ -55,6 +55,7 @@ from sglang.srt.layers.moe.topk import select_experts
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.quantization.fp8_utils import (
     block_quant_to_tensor_quant,
+    channel_quant_to_tensor_quant,
     input_to_float8,
     normalize_e4m3fn_to_e4m3fnuz,
 )
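The newly imported `channel_quant_to_tensor_quant` collapses per-output-channel fp8 scales into a single per-tensor scale, mirroring what `block_quant_to_tensor_quant` already does for block-wise scales. A minimal sketch of the idea, assuming a `[out, in]` weight and `[out, 1]` channel scales (illustrative only, not the sglang implementation):

import torch

def channel_to_tensor_quant_sketch(w_fp8: torch.Tensor, channel_scales: torch.Tensor):
    # Dequantize with the per-output-channel scales ([out, 1] broadcasts over [out, in]).
    w_fp32 = w_fp8.to(torch.float32) * channel_scales
    # Pick one scale for the whole tensor so its max maps to fp8's largest value.
    fp8_max = torch.finfo(torch.float8_e4m3fn).max
    tensor_scale = w_fp32.abs().max() / fp8_max
    w_requant = (w_fp32 / tensor_scale).clamp(-fp8_max, fp8_max).to(torch.float8_e4m3fn)
    return w_requant, tensor_scale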
@@ -1411,27 +1412,34 @@ class DeepseekV2ForCausalLM(nn.Module):
             w = self_attn.kv_b_proj.weight
             # NOTE(HandH1998): Since `bmm_fp8` only supports per-tensor scale, we have to requantize `self_attn.kv_b_proj`.
             # This may affect the accuracy of fp8 model.
-            if hasattr(self.quant_config, "weight_block_size") and w.dtype in (
+            if w.dtype in (
                 torch.float8_e4m3fn,
                 torch.float8_e4m3fnuz,
             ):
-                weight_block_size = self.quant_config.weight_block_size
-                if weight_block_size is not None:
-                    assert hasattr(self_attn.kv_b_proj, "weight_scale_inv")
-                    if _is_hip:
-                        weight, weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz(
-                            weight=w,
-                            weight_scale=self_attn.kv_b_proj.weight_scale_inv,
-                            input_scale=None,
-                        )
-                    else:
-                        weight = w
-                        weight_scale = self_attn.kv_b_proj.weight_scale_inv
-
-                    w, scale = block_quant_to_tensor_quant(
-                        weight, weight_scale, weight_block_size
-                    )
-                    self_attn.w_scale = scale
+                if hasattr(self.quant_config, "weight_block_size"):
+                    weight_block_size = self.quant_config.weight_block_size
+                    if weight_block_size is not None:
+                        assert hasattr(self_attn.kv_b_proj, "weight_scale_inv")
+                        if _is_hip:
+                            weight, weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz(
+                                weight=w,
+                                weight_scale=self_attn.kv_b_proj.weight_scale_inv,
+                                input_scale=None,
+                            )
+                        else:
+                            weight = w
+                            weight_scale = self_attn.kv_b_proj.weight_scale_inv
+
+                        w, scale = block_quant_to_tensor_quant(
+                            weight, weight_scale, weight_block_size
+                        )
+                        self_attn.w_scale = scale
+                else:
+                    weight = w
+                    weight_scale = self_attn.kv_b_proj.weight_scale
+                    w, scale = channel_quant_to_tensor_quant(weight, weight_scale)
+                    self_attn.w_scale = scale
 
             if w.dtype == torch.int8:
                 if hasattr(self.quant_config, "weight_block_size"):
                     # block-wise int8 need it
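The hunk above decouples the dtype check from the presence of `weight_block_size`: block-quantized fp8 checkpoints (which ship `weight_scale_inv` plus a block size) keep the existing `block_quant_to_tensor_quant` path, while channel-quantized fp8 checkpoints such as the Llama4 fp8 release fall into the new `else` branch. Both paths end in one per-tensor scale because `bmm_fp8` accepts only a single scalar scale per operand. Reference semantics, as a conceptual stand-in only (the real kernel computes in fp8 on the GPU):

import torch

def bmm_fp8_reference(a_fp8, b_fp8, a_scale, b_scale, out_dtype=torch.bfloat16):
    # Per-tensor scales are scalars, so they can be applied around the matmul.
    a = a_fp8.to(torch.float32) * a_scale
    b = b_fp8.to(torch.float32) * b_scale
    return torch.bmm(a, b).to(out_dtype)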
@@ -414,7 +414,7 @@ class Llama4Model(nn.Module):
             lambda idx, prefix: Llama4DecoderLayer(
                 config=config, layer_id=idx, quant_config=quant_config, prefix=prefix
             ),
-            prefix="model.layers",
+            prefix=add_prefix("layers", prefix),
         )
 
         self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
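The hard-coded `prefix="model.layers"` only produced correct parameter names when `Llama4Model` was the top-level text model; nested under the multimodal wrapper, the layers live at names like `language_model.model.layers.*`. `add_prefix` composes dotted names; a minimal stand-in with the behavior the call site relies on (the real helper lives in `sglang.srt.utils`):

def add_prefix(name: str, prefix: str) -> str:
    # Join a sub-module name onto an optional parent prefix.
    return name if not prefix else f"{prefix}.{name}"

assert add_prefix("layers", "") == "layers"
assert add_prefix("layers", "language_model.model") == "language_model.model.layers"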
@@ -7,6 +7,7 @@ from torch import nn
 from transformers import Llama4Config
 
 from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
 from sglang.srt.layers.quantization import QuantizationConfig
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.model_loader.weight_utils import default_weight_loader
@@ -16,6 +17,7 @@ from sglang.srt.utils import add_prefix
 class Llama4ForConditionalGeneration(nn.Module):
     packed_modules_mapping = {
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+        "gate_up_proj": ["gate_proj", "up_proj"],
     }
 
     def __init__(
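`packed_modules_mapping` tells quantization configs which checkpoint modules are fused into one runtime parameter; adding the `gate_up_proj` entry lets per-module fp8 metadata for `gate_proj`/`up_proj` resolve to the fused projection. A hypothetical illustration of the lookup (the helper name is invented for this example):

packed_modules_mapping = {
    "qkv_proj": ["q_proj", "k_proj", "v_proj"],
    "gate_up_proj": ["gate_proj", "up_proj"],
}

def fused_param_for(ckpt_module: str) -> str:
    # Map a checkpoint module (e.g. "up_proj") to its fused runtime parameter.
    for fused, parts in packed_modules_mapping.items():
        if ckpt_module in parts:
            return fused
    return ckpt_module  # unfused modules map to themselves

assert fused_param_for("up_proj") == "gate_up_proj"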
@@ -96,6 +98,15 @@ class Llama4ForConditionalGeneration(nn.Module):
 
         num_experts = self.config.text_config.num_local_experts
 
+        # Params for weights, fp8 weight scales, fp8 activation scales
+        # (param_name, weight_name, expert_id, shard_id)
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=num_experts,
+        )
+
         for name, loaded_weight in weights:
 
             if name.startswith("vision_model") or name.startswith(
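`make_expert_params_mapping` enumerates one `(param_name, weight_name, expert_id, shard_id)` tuple per expert and projection so that per-expert checkpoint tensors can be routed into the fused `w13_weight`/`w2_weight` parameters. A sketch of the table it builds (the exact name fragments are an assumption; see `fused_moe_triton` for the real code):

def make_expert_params_mapping_sketch(gate: str, down: str, up: str, num_experts: int):
    # One entry per (expert, shard); w1/w3 load into the fused w13 parameter.
    return [
        ("experts.w13_" if shard_id in ("w1", "w3") else "experts.w2_",
         f"experts.{expert_id}.{weight_name}.",
         expert_id,
         shard_id)
        for expert_id in range(num_experts)
        for shard_id, weight_name in (("w1", gate), ("w2", down), ("w3", up))
    ]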
@@ -115,31 +126,54 @@ class Llama4ForConditionalGeneration(nn.Module):
                 break
             else:
                 if ".experts" in name:
-                    if ".gate_up_proj" in name:
-                        name_list = [
-                            name.replace(".experts.gate_up_proj", ".experts.w13_weight")
-                        ] * 2
-                        loaded_weight_list = loaded_weight.chunk(2, dim=-1)
-                        shard_id_list = ["w1", "w3"]
-                    else:
-                        name_list = [
-                            name.replace(".experts.down_proj", ".experts.w2_weight")
-                        ]
-                        shard_id_list = ["w2"]
-                        loaded_weight_list = [loaded_weight]
-                    for name, loaded_weight, shard_id in zip(
-                        name_list, loaded_weight_list, shard_id_list
-                    ):
-                        param = params_dict[name]
-                        weight_loader = param.weight_loader
-                        for expert_id in range(num_experts):
-                            weight_loader(
-                                param,
-                                loaded_weight[expert_id].T,
-                                name,
-                                shard_id=shard_id,
-                                expert_id=expert_id,
-                            )
+                    # NOTE: llama4 fp8 has different weight format for experts
+                    if (
+                        "experts.gate_up_proj" not in name
+                        and "experts.down_proj" not in name
+                    ):
+                        for mapping in expert_params_mapping:
+                            param_name, weight_name, expert_id, shard_id = mapping
+                            if weight_name not in name:
+                                continue
+                            name = name.replace(weight_name, param_name)
+                            param = params_dict[name]
+                            weight_loader = param.weight_loader
+                            weight_loader(
+                                param,
+                                loaded_weight,
+                                name,
+                                shard_id=shard_id,
+                                expert_id=expert_id,
+                            )
+                            break
+                    else:
+                        if ".gate_up_proj" in name:
+                            name_list = [
+                                name.replace(
+                                    ".experts.gate_up_proj", ".experts.w13_weight"
+                                )
+                            ] * 2
+                            loaded_weight_list = loaded_weight.chunk(2, dim=-1)
+                            shard_id_list = ["w1", "w3"]
+                        else:
+                            name_list = [
+                                name.replace(".experts.down_proj", ".experts.w2_weight")
+                            ]
+                            shard_id_list = ["w2"]
+                            loaded_weight_list = [loaded_weight]
+                        for name, loaded_weight, shard_id in zip(
+                            name_list, loaded_weight_list, shard_id_list
+                        ):
+                            param = params_dict[name]
+                            weight_loader = param.weight_loader
+                            for expert_id in range(num_experts):
+                                weight_loader(
+                                    param,
+                                    loaded_weight[expert_id].T,
+                                    name,
+                                    shard_id=shard_id,
+                                    expert_id=expert_id,
+                                )
                 else:
                     # Skip loading extra bias for GPTQ models.
                     if name.endswith(".bias") and name not in params_dict:
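The branch added above distinguishes the two expert-weight layouts noted in the diff: bf16 Llama4 checkpoints store each MoE projection fused across experts (one `experts.gate_up_proj` tensor covering all experts), so the loader chunks it and loads each expert's slice transposed, while the fp8 checkpoints ship per-expert tensors plus scales whose names only match via `expert_params_mapping`. A sketch of the fused path's tensor handling, with shapes assumed purely for illustration:

import torch

# Assumed shapes for illustration: all experts' gate/up weights in one tensor.
num_experts, hidden, intermediate = 16, 5120, 8192
gate_up = torch.randn(num_experts, hidden, 2 * intermediate, dtype=torch.bfloat16)

# The fused path chunks the last dim into w1 (gate) and w3 (up) shards ...
w1, w3 = gate_up.chunk(2, dim=-1)
# ... and hands each expert's slice over transposed, as loaded_weight[expert_id].T.
print(w1[0].T.shape)  # torch.Size([8192, 5120])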