[Minor] Fix grok model loader (#2473)
This commit is contained in:
@@ -25,9 +25,11 @@ from transformers import PretrainedConfig
|
|||||||
from vllm.distributed import get_tensor_model_parallel_world_size
|
from vllm.distributed import get_tensor_model_parallel_world_size
|
||||||
from vllm.model_executor.layers.rotary_embedding import get_rope
|
from vllm.model_executor.layers.rotary_embedding import get_rope
|
||||||
|
|
||||||
|
from sglang.srt.layers.activation import GeluAndMul
|
||||||
from sglang.srt.layers.fused_moe_triton import FusedMoE
|
from sglang.srt.layers.fused_moe_triton import FusedMoE
|
||||||
from sglang.srt.layers.layernorm import RMSNorm
|
from sglang.srt.layers.layernorm import RMSNorm
|
||||||
from sglang.srt.layers.linear import (
|
from sglang.srt.layers.linear import (
|
||||||
|
MergedColumnParallelLinear,
|
||||||
QKVParallelLinear,
|
QKVParallelLinear,
|
||||||
ReplicatedLinear,
|
ReplicatedLinear,
|
||||||
RowParallelLinear,
|
RowParallelLinear,
|
||||||
@@ -40,10 +42,43 @@ from sglang.srt.layers.vocab_parallel_embedding import (
|
|||||||
VocabParallelEmbedding,
|
VocabParallelEmbedding,
|
||||||
)
|
)
|
||||||
from sglang.srt.model_executor.forward_batch_info import ForwardBatch
|
from sglang.srt.model_executor.forward_batch_info import ForwardBatch
|
||||||
from sglang.srt.model_loader.loader import DefaultModelLoader
|
|
||||||
from sglang.srt.model_loader.weight_utils import default_weight_loader
|
from sglang.srt.model_loader.weight_utils import default_weight_loader
|
||||||
|
|
||||||
|
|
||||||
|
class Grok1MLP(nn.Module):
    """Gated feed-forward block used by Grok1.

    The input goes through a merged gate/up projection
    (``MergedColumnParallelLinear`` with two outputs of size
    ``intermediate_size``), then ``GeluAndMul`` with the tanh approximation,
    and finally a ``RowParallelLinear`` projection back to ``hidden_size``.

    Args:
        hidden_size: Input width of ``gate_up_proj`` and output width of
            ``down_proj``.
        intermediate_size: Width of each of the two merged gate/up outputs
            and input width of ``down_proj``.
        quant_config: Optional quantization config, forwarded unchanged to
            both linear layers.
        prefix: Name prefix for this module; ``".gate_up_proj"`` and
            ``".down_proj"`` are appended for the two linear layers.
        reduce_results: Forwarded to ``down_proj`` as its ``reduce_results``
            argument.
    """

    def __init__(
        self,
        hidden_size: int,
        intermediate_size: int,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
        reduce_results=True,
    ) -> None:
        super().__init__()

        # Gate and up projections share one merged layer, so there are two
        # equally sized output partitions.
        merged_output_sizes = [intermediate_size, intermediate_size]
        self.gate_up_proj = MergedColumnParallelLinear(
            hidden_size,
            merged_output_sizes,
            bias=False,
            quant_config=quant_config,
            prefix=f"{prefix}.gate_up_proj",
        )

        self.down_proj = RowParallelLinear(
            intermediate_size,
            hidden_size,
            bias=False,
            quant_config=quant_config,
            prefix=f"{prefix}.down_proj",
            reduce_results=reduce_results,
        )

        self.act_fn = GeluAndMul(approximate="tanh")

    def forward(self, x):
        """Apply gate/up projection, activation, and down projection to ``x``.

        Both linear layers return a pair; only the first element is used and
        the second is discarded.
        """
        projected, _ = self.gate_up_proj(x)
        output, _ = self.down_proj(self.act_fn(projected))
        return output
|
||||||
|
|
||||||
|
|
||||||
class Grok1MoE(nn.Module):
|
class Grok1MoE(nn.Module):
|
||||||
"""A tensor-parallel MoE implementation for Grok1 that shards each expert
|
"""A tensor-parallel MoE implementation for Grok1 that shards each expert
|
||||||
across all ranks.
|
across all ranks.
|
||||||
@@ -55,6 +90,7 @@ class Grok1MoE(nn.Module):
|
|||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
config: PretrainedConfig,
|
||||||
num_experts: int,
|
num_experts: int,
|
||||||
top_k: int,
|
top_k: int,
|
||||||
hidden_size: int,
|
hidden_size: int,
|
||||||
@@ -62,6 +98,7 @@ class Grok1MoE(nn.Module):
|
|||||||
params_dtype: Optional[torch.dtype] = None,
|
params_dtype: Optional[torch.dtype] = None,
|
||||||
quant_config: Optional[QuantizationConfig] = None,
|
quant_config: Optional[QuantizationConfig] = None,
|
||||||
tp_size: Optional[int] = None,
|
tp_size: Optional[int] = None,
|
||||||
|
reduce_results=True,
|
||||||
):
|
):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.hidden_size = hidden_size
|
self.hidden_size = hidden_size
|
||||||
@@ -75,13 +112,16 @@ class Grok1MoE(nn.Module):
|
|||||||
quant_config=None,
|
quant_config=None,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
self.router_logit_softcapping = getattr(
|
||||||
|
config, "router_logit_softcapping", 30.0
|
||||||
|
)
|
||||||
self.experts = FusedMoE(
|
self.experts = FusedMoE(
|
||||||
num_experts=num_experts,
|
num_experts=num_experts,
|
||||||
top_k=top_k,
|
top_k=top_k,
|
||||||
hidden_size=hidden_size,
|
hidden_size=hidden_size,
|
||||||
intermediate_size=intermediate_size,
|
intermediate_size=intermediate_size,
|
||||||
params_dtype=params_dtype,
|
params_dtype=params_dtype,
|
||||||
reduce_results=True,
|
reduce_results=reduce_results,
|
||||||
renormalize=False,
|
renormalize=False,
|
||||||
quant_config=quant_config,
|
quant_config=quant_config,
|
||||||
tp_size=tp_size,
|
tp_size=tp_size,
|
||||||
@@ -91,9 +131,12 @@ class Grok1MoE(nn.Module):
|
|||||||
# NOTE: hidden_states can have either 1D or 2D shape.
|
# NOTE: hidden_states can have either 1D or 2D shape.
|
||||||
orig_shape = hidden_states.shape
|
orig_shape = hidden_states.shape
|
||||||
hidden_states = hidden_states.view(-1, self.hidden_size)
|
hidden_states = hidden_states.view(-1, self.hidden_size)
|
||||||
|
|
||||||
# router_logits: (num_tokens, n_experts)
|
# router_logits: (num_tokens, n_experts)
|
||||||
router_logits, _ = self.gate(hidden_states)
|
router_logits, _ = self.gate(hidden_states)
|
||||||
router_logits = 30.0 * F.tanh(router_logits / 30.0)
|
router_logits = 30.0 * F.tanh(router_logits / 30.0)
|
||||||
|
|
||||||
|
# need to assert self.gate.quant_method is unquantized
|
||||||
final_hidden_states = self.experts(hidden_states, router_logits)
|
final_hidden_states = self.experts(hidden_states, router_logits)
|
||||||
return final_hidden_states.view(orig_shape)
|
return final_hidden_states.view(orig_shape)
|
||||||
|
|
||||||
@@ -101,16 +144,18 @@ class Grok1MoE(nn.Module):
|
|||||||
class Grok1Attention(nn.Module):
|
class Grok1Attention(nn.Module):
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
config: PretrainedConfig,
|
||||||
hidden_size: int,
|
hidden_size: int,
|
||||||
num_heads: int,
|
num_heads: int,
|
||||||
num_kv_heads: int,
|
num_kv_heads: int,
|
||||||
layer_id: int = 0,
|
layer_id: int = 0,
|
||||||
max_position: int = 4096 * 32,
|
max_position: int = 4096 * 32,
|
||||||
rope_theta: float = 10000,
|
rope_theta: float = 10000,
|
||||||
logit_cap: float = 30,
|
|
||||||
quant_config: Optional[QuantizationConfig] = None,
|
quant_config: Optional[QuantizationConfig] = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
self.config = config
|
||||||
|
self.layer_id = layer_id
|
||||||
self.hidden_size = hidden_size
|
self.hidden_size = hidden_size
|
||||||
tp_size = get_tensor_model_parallel_world_size()
|
tp_size = get_tensor_model_parallel_world_size()
|
||||||
self.total_num_heads = num_heads
|
self.total_num_heads = num_heads
|
||||||
@@ -126,7 +171,7 @@ class Grok1Attention(nn.Module):
|
|||||||
# the KV heads across multiple tensor parallel GPUs.
|
# the KV heads across multiple tensor parallel GPUs.
|
||||||
assert tp_size % self.total_num_kv_heads == 0
|
assert tp_size % self.total_num_kv_heads == 0
|
||||||
self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
|
self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
|
||||||
self.head_dim = 128
|
self.head_dim = getattr(config, "head_dim", 128)
|
||||||
self.q_size = self.num_heads * self.head_dim
|
self.q_size = self.num_heads * self.head_dim
|
||||||
self.kv_size = self.num_kv_heads * self.head_dim
|
self.kv_size = self.num_kv_heads * self.head_dim
|
||||||
self.scaling = self.head_dim**-0.5
|
self.scaling = self.head_dim**-0.5
|
||||||
@@ -140,7 +185,6 @@ class Grok1Attention(nn.Module):
|
|||||||
bias=False,
|
bias=False,
|
||||||
quant_config=quant_config,
|
quant_config=quant_config,
|
||||||
)
|
)
|
||||||
|
|
||||||
self.o_proj = RowParallelLinear(
|
self.o_proj = RowParallelLinear(
|
||||||
self.total_num_heads * self.head_dim,
|
self.total_num_heads * self.head_dim,
|
||||||
hidden_size,
|
hidden_size,
|
||||||
@@ -154,6 +198,9 @@ class Grok1Attention(nn.Module):
|
|||||||
base=int(self.rope_theta),
|
base=int(self.rope_theta),
|
||||||
is_neox_style=True,
|
is_neox_style=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
logit_cap = max(getattr(config, "attn_logit_softcapping", 30.0), 0.0)
|
||||||
|
|
||||||
self.attn = RadixAttention(
|
self.attn = RadixAttention(
|
||||||
self.num_heads,
|
self.num_heads,
|
||||||
self.head_dim,
|
self.head_dim,
|
||||||
@@ -162,7 +209,6 @@ class Grok1Attention(nn.Module):
|
|||||||
layer_id=layer_id,
|
layer_id=layer_id,
|
||||||
logit_cap=logit_cap,
|
logit_cap=logit_cap,
|
||||||
)
|
)
|
||||||
# TODO(lianmin): load logit cap from config
|
|
||||||
|
|
||||||
def forward(
|
def forward(
|
||||||
self,
|
self,
|
||||||
@@ -186,10 +232,12 @@ class Grok1DecoderLayer(nn.Module):
|
|||||||
quant_config: Optional[QuantizationConfig] = None,
|
quant_config: Optional[QuantizationConfig] = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
self.num_experts = config.num_local_experts
|
||||||
self.hidden_size = config.hidden_size
|
self.hidden_size = config.hidden_size
|
||||||
|
|
||||||
rope_theta = getattr(config, "rope_theta", 10000)
|
rope_theta = getattr(config, "rope_theta", 10000)
|
||||||
self.self_attn = Grok1Attention(
|
self.self_attn = Grok1Attention(
|
||||||
|
config=config,
|
||||||
hidden_size=self.hidden_size,
|
hidden_size=self.hidden_size,
|
||||||
num_heads=config.num_attention_heads,
|
num_heads=config.num_attention_heads,
|
||||||
max_position=config.max_position_embeddings,
|
max_position=config.max_position_embeddings,
|
||||||
@@ -199,11 +247,17 @@ class Grok1DecoderLayer(nn.Module):
|
|||||||
quant_config=quant_config,
|
quant_config=quant_config,
|
||||||
)
|
)
|
||||||
self.block_sparse_moe = Grok1MoE(
|
self.block_sparse_moe = Grok1MoE(
|
||||||
|
config=config,
|
||||||
num_experts=config.num_local_experts,
|
num_experts=config.num_local_experts,
|
||||||
top_k=config.num_experts_per_tok,
|
top_k=config.num_experts_per_tok,
|
||||||
hidden_size=config.hidden_size,
|
hidden_size=config.hidden_size,
|
||||||
intermediate_size=config.intermediate_size,
|
intermediate_size=getattr(
|
||||||
|
config,
|
||||||
|
"moe_intermediate_size",
|
||||||
|
getattr(config, "intermediate_size", None),
|
||||||
|
),
|
||||||
quant_config=quant_config,
|
quant_config=quant_config,
|
||||||
|
reduce_results=True,
|
||||||
)
|
)
|
||||||
self.pre_attn_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
|
self.pre_attn_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
|
||||||
self.post_attn_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
|
self.post_attn_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
|
||||||
@@ -284,6 +338,7 @@ class Grok1ForCausalLM(nn.Module):
|
|||||||
self,
|
self,
|
||||||
config: PretrainedConfig,
|
config: PretrainedConfig,
|
||||||
quant_config: Optional[QuantizationConfig] = None,
|
quant_config: Optional[QuantizationConfig] = None,
|
||||||
|
cache_config=None,
|
||||||
) -> None:
|
) -> None:
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.config = config
|
self.config = config
|
||||||
@@ -310,6 +365,8 @@ class Grok1ForCausalLM(nn.Module):
|
|||||||
("qkv_proj", "q_proj", "q"),
|
("qkv_proj", "q_proj", "q"),
|
||||||
("qkv_proj", "k_proj", "k"),
|
("qkv_proj", "k_proj", "k"),
|
||||||
("qkv_proj", "v_proj", "v"),
|
("qkv_proj", "v_proj", "v"),
|
||||||
|
("gate_up_proj", "gate_proj", 0),
|
||||||
|
("gate_up_proj", "up_proj", 1),
|
||||||
]
|
]
|
||||||
|
|
||||||
# Params for weights, fp8 weight scales, fp8 activation scales
|
# Params for weights, fp8 weight scales, fp8 activation scales
|
||||||
@@ -345,6 +402,11 @@ class Grok1ForCausalLM(nn.Module):
|
|||||||
continue
|
continue
|
||||||
name = name.replace(weight_name, param_name)
|
name = name.replace(weight_name, param_name)
|
||||||
|
|
||||||
|
if (
|
||||||
|
name.endswith(".bias") or name.endswith("_bias")
|
||||||
|
) and name not in params_dict:
|
||||||
|
continue
|
||||||
|
|
||||||
param = params_dict[name]
|
param = params_dict[name]
|
||||||
weight_loader = param.weight_loader
|
weight_loader = param.weight_loader
|
||||||
weight_loader(
|
weight_loader(
|
||||||
@@ -357,7 +419,9 @@ class Grok1ForCausalLM(nn.Module):
|
|||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
# Skip loading extra bias for GPTQ models.
|
# Skip loading extra bias for GPTQ models.
|
||||||
if name.endswith(".bias") and name not in params_dict:
|
if (
|
||||||
|
name.endswith(".bias") or name.endswith("_bias")
|
||||||
|
) and name not in params_dict:
|
||||||
continue
|
continue
|
||||||
# Skip loading kv_scale from ckpts towards new design.
|
# Skip loading kv_scale from ckpts towards new design.
|
||||||
if name.endswith(".kv_scale") and name not in params_dict:
|
if name.endswith(".kv_scale") and name not in params_dict:
|
||||||
|
|||||||
Reference in New Issue
Block a user