Support LingV2 model (#10359)
Co-authored-by: 羽癫 <yudian.zy@antgroup.com>
Co-authored-by: guoyuhong <yuhong.gyh@antgroup.com>
This commit is contained in:
@@ -141,6 +141,11 @@ class ModelConfig:
|
||||
|
||||
if is_draft_model and self.hf_config.architectures[0] == "MiMoForCausalLM":
|
||||
self.hf_config.architectures[0] = "MiMoMTP"
|
||||
if is_draft_model and self.hf_config.architectures[0] in [
|
||||
"BailingMoeV2ForCausalLM",
|
||||
"BailingMoeForCausalLM",
|
||||
]:
|
||||
self.hf_config.architectures[0] = "BailingMoeForCausalLMNextN"
|
||||
if (
|
||||
is_draft_model
|
||||
and self.hf_config.architectures[0] == "Ernie4_5_MoeForCausalLM"
|
||||
|
||||
@@ -893,6 +893,35 @@ class QKVParallelLinear(ColumnParallelLinear):
|
||||
)
|
||||
self.weight_loader_v2(param, loaded_weight_shard, shard_id)
|
||||
|
||||
def _load_qkv_block_scale(
|
||||
self, param: BasevLLMParameter, loaded_weight: torch.Tensor
|
||||
):
|
||||
block_n, _ = self.quant_method.quant_config.weight_block_size
|
||||
q_size = self.total_num_heads * self.head_size // block_n
|
||||
k_size = self.total_num_kv_heads * self.head_size // block_n
|
||||
v_size = self.total_num_kv_heads * self.head_size // block_n
|
||||
shard_offsets = [
|
||||
# (shard_id, shard_offset, shard_size)
|
||||
("q", 0, q_size),
|
||||
("k", q_size, k_size),
|
||||
("v", q_size + k_size, v_size),
|
||||
]
|
||||
for shard_id, shard_offset, shard_size in shard_offsets:
|
||||
loaded_weight_shard = loaded_weight.narrow(
|
||||
param.output_dim, shard_offset, shard_size
|
||||
)
|
||||
rank_shard_offset = self._get_shard_offset_mapping(shard_id) // block_n
|
||||
rank_shard_size = self._get_shard_size_mapping(shard_id) // block_n
|
||||
param.load_qkv_weight(
|
||||
loaded_weight=loaded_weight_shard,
|
||||
num_heads=self.num_kv_head_replicas,
|
||||
shard_id=shard_id,
|
||||
shard_offset=rank_shard_offset,
|
||||
shard_size=rank_shard_size,
|
||||
tp_rank=self.tp_rank,
|
||||
use_presharded_weights=self.use_presharded_weights,
|
||||
)
|
||||
|
||||
def weight_loader_v2(
|
||||
self,
|
||||
param: BasevLLMParameter,
|
||||
@@ -906,6 +935,9 @@ class QKVParallelLinear(ColumnParallelLinear):
|
||||
elif type(param) in (RowvLLMParameter, BasevLLMParameter):
|
||||
param.load_qkv_weight(loaded_weight=loaded_weight)
|
||||
return
|
||||
elif isinstance(param, BlockQuantScaleParameter):
|
||||
self._load_qkv_block_scale(param, loaded_weight)
|
||||
return
|
||||
# TODO: @dsikka - move to parameter.py
|
||||
self._load_fused_module_from_checkpoint(param, loaded_weight)
|
||||
return
|
||||
|
||||
@@ -0,0 +1,146 @@
|
||||
{
|
||||
"1": {
|
||||
"BLOCK_SIZE_M": 16,
|
||||
"BLOCK_SIZE_N": 64,
|
||||
"BLOCK_SIZE_K": 64,
|
||||
"GROUP_SIZE_M": 1,
|
||||
"num_warps": 4,
|
||||
"num_stages": 4
|
||||
},
|
||||
"2": {
|
||||
"BLOCK_SIZE_M": 16,
|
||||
"BLOCK_SIZE_N": 64,
|
||||
"BLOCK_SIZE_K": 64,
|
||||
"GROUP_SIZE_M": 1,
|
||||
"num_warps": 4,
|
||||
"num_stages": 3
|
||||
},
|
||||
"4": {
|
||||
"BLOCK_SIZE_M": 16,
|
||||
"BLOCK_SIZE_N": 64,
|
||||
"BLOCK_SIZE_K": 128,
|
||||
"GROUP_SIZE_M": 16,
|
||||
"num_warps": 4,
|
||||
"num_stages": 2
|
||||
},
|
||||
"8": {
|
||||
"BLOCK_SIZE_M": 16,
|
||||
"BLOCK_SIZE_N": 64,
|
||||
"BLOCK_SIZE_K": 64,
|
||||
"GROUP_SIZE_M": 64,
|
||||
"num_warps": 4,
|
||||
"num_stages": 4
|
||||
},
|
||||
"16": {
|
||||
"BLOCK_SIZE_M": 16,
|
||||
"BLOCK_SIZE_N": 128,
|
||||
"BLOCK_SIZE_K": 64,
|
||||
"GROUP_SIZE_M": 1,
|
||||
"num_warps": 8,
|
||||
"num_stages": 5
|
||||
},
|
||||
"24": {
|
||||
"BLOCK_SIZE_M": 16,
|
||||
"BLOCK_SIZE_N": 128,
|
||||
"BLOCK_SIZE_K": 64,
|
||||
"GROUP_SIZE_M": 1,
|
||||
"num_warps": 8,
|
||||
"num_stages": 5
|
||||
},
|
||||
"32": {
|
||||
"BLOCK_SIZE_M": 16,
|
||||
"BLOCK_SIZE_N": 128,
|
||||
"BLOCK_SIZE_K": 64,
|
||||
"GROUP_SIZE_M": 1,
|
||||
"num_warps": 8,
|
||||
"num_stages": 5
|
||||
},
|
||||
"48": {
|
||||
"BLOCK_SIZE_M": 16,
|
||||
"BLOCK_SIZE_N": 128,
|
||||
"BLOCK_SIZE_K": 64,
|
||||
"GROUP_SIZE_M": 1,
|
||||
"num_warps": 8,
|
||||
"num_stages": 5
|
||||
},
|
||||
"64": {
|
||||
"BLOCK_SIZE_M": 16,
|
||||
"BLOCK_SIZE_N": 128,
|
||||
"BLOCK_SIZE_K": 128,
|
||||
"GROUP_SIZE_M": 1,
|
||||
"num_warps": 8,
|
||||
"num_stages": 3
|
||||
},
|
||||
"96": {
|
||||
"BLOCK_SIZE_M": 16,
|
||||
"BLOCK_SIZE_N": 64,
|
||||
"BLOCK_SIZE_K": 64,
|
||||
"GROUP_SIZE_M": 1,
|
||||
"num_warps": 4,
|
||||
"num_stages": 5
|
||||
},
|
||||
"128": {
|
||||
"BLOCK_SIZE_M": 16,
|
||||
"BLOCK_SIZE_N": 128,
|
||||
"BLOCK_SIZE_K": 64,
|
||||
"GROUP_SIZE_M": 1,
|
||||
"num_warps": 4,
|
||||
"num_stages": 3
|
||||
},
|
||||
"256": {
|
||||
"BLOCK_SIZE_M": 16,
|
||||
"BLOCK_SIZE_N": 64,
|
||||
"BLOCK_SIZE_K": 64,
|
||||
"GROUP_SIZE_M": 1,
|
||||
"num_warps": 4,
|
||||
"num_stages": 4
|
||||
},
|
||||
"512": {
|
||||
"BLOCK_SIZE_M": 16,
|
||||
"BLOCK_SIZE_N": 256,
|
||||
"BLOCK_SIZE_K": 64,
|
||||
"GROUP_SIZE_M": 1,
|
||||
"num_warps": 4,
|
||||
"num_stages": 3
|
||||
},
|
||||
"1024": {
|
||||
"BLOCK_SIZE_M": 64,
|
||||
"BLOCK_SIZE_N": 128,
|
||||
"BLOCK_SIZE_K": 64,
|
||||
"GROUP_SIZE_M": 1,
|
||||
"num_warps": 8,
|
||||
"num_stages": 3
|
||||
},
|
||||
"1536": {
|
||||
"BLOCK_SIZE_M": 64,
|
||||
"BLOCK_SIZE_N": 128,
|
||||
"BLOCK_SIZE_K": 64,
|
||||
"GROUP_SIZE_M": 1,
|
||||
"num_warps": 8,
|
||||
"num_stages": 3
|
||||
},
|
||||
"2048": {
|
||||
"BLOCK_SIZE_M": 64,
|
||||
"BLOCK_SIZE_N": 64,
|
||||
"BLOCK_SIZE_K": 64,
|
||||
"GROUP_SIZE_M": 1,
|
||||
"num_warps": 4,
|
||||
"num_stages": 3
|
||||
},
|
||||
"3072": {
|
||||
"BLOCK_SIZE_M": 64,
|
||||
"BLOCK_SIZE_N": 128,
|
||||
"BLOCK_SIZE_K": 64,
|
||||
"GROUP_SIZE_M": 32,
|
||||
"num_warps": 4,
|
||||
"num_stages": 3
|
||||
},
|
||||
"4096": {
|
||||
"BLOCK_SIZE_M": 64,
|
||||
"BLOCK_SIZE_N": 128,
|
||||
"BLOCK_SIZE_K": 64,
|
||||
"GROUP_SIZE_M": 32,
|
||||
"num_warps": 4,
|
||||
"num_stages": 3
|
||||
}
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
168
python/sglang/srt/models/bailing_moe_nextn.py
Normal file
168
python/sglang/srt/models/bailing_moe_nextn.py
Normal file
@@ -0,0 +1,168 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2023 Antgroup and The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
|
||||
# and OPT implementations in this library. It has been modified from its
|
||||
# original forms to accommodate minor architectural differences compared
|
||||
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" SGLang BailingMoENextN model."""
|
||||
import logging
|
||||
from typing import Iterable, Optional, Tuple
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
from transformers import PretrainedConfig
|
||||
|
||||
from sglang.srt.distributed import get_tensor_model_parallel_world_size
|
||||
from sglang.srt.layers.dp_attention import is_dp_attention_enabled
|
||||
from sglang.srt.layers.layernorm import RMSNorm
|
||||
from sglang.srt.layers.logits_processor import LogitsProcessor
|
||||
from sglang.srt.layers.moe.topk import select_experts
|
||||
from sglang.srt.layers.quantization.base_config import QuantizationConfig
|
||||
from sglang.srt.layers.vocab_parallel_embedding import (
|
||||
ParallelLMHead,
|
||||
VocabParallelEmbedding,
|
||||
)
|
||||
from sglang.srt.managers.schedule_batch import global_server_args_dict
|
||||
from sglang.srt.model_executor.forward_batch_info import ForwardBatch
|
||||
from sglang.srt.models.bailing_moe import BailingMoEBlock, BailingMoEForCausalLM
|
||||
from sglang.srt.utils import add_prefix
|
||||
|
||||
LoraConfig = None
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class BailingMoEModelNextN(nn.Module):
    """Single-layer draft (NextN/MTP) decoder for BailingMoE speculative decoding.

    The token embedding and the previous step's hidden state are each
    RMS-normed, concatenated, and projected back to ``hidden_size`` by
    ``eh_proj`` before running through one BailingMoE decoder block.
    """

    def __init__(
        self,
        config: PretrainedConfig,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ) -> None:
        super().__init__()
        if quant_config is not None and quant_config.get_name() == "modelopt_fp4":
            # modelopt_fp4 checkpoints leave the NextN layer unquantized,
            # so run this draft module without a quant config.
            logger.warning(
                "Overriding BailingMoeForCausalLMNextN quant config for "
                "modelopt_fp4 model."
            )
            quant_config = None

        self.vocab_size = config.vocab_size

        self.word_embeddings = VocabParallelEmbedding(
            config.vocab_size,
            config.hidden_size,
            enable_tp=not is_dp_attention_enabled(),
            prefix=add_prefix("word_embeddings", prefix),
        )

        self.enorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.hnorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

        # Projects the concatenated [embedding ; previous hidden state]
        # (2 * hidden_size) back down to hidden_size.
        self.eh_proj = nn.Linear(2 * config.hidden_size, config.hidden_size, bias=False)

        self.decoder = BailingMoEBlock(
            config,
            0,
            quant_config=quant_config,
            prefix=add_prefix("decoder", prefix),
        )

        # Placeholder module so checkpoint names under "shared_head" resolve.
        self.shared_head = nn.Module()
        self.final_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        forward_batch: ForwardBatch,
        input_embeds: torch.Tensor = None,
    ) -> torch.Tensor:
        """Run one NextN decoder step and return the final hidden states."""
        if input_embeds is None:
            hidden_states = self.word_embeddings(input_ids)
        else:
            hidden_states = input_embeds

        # Skip the eh projection for empty (idle) batches.
        if hidden_states.shape[0] > 0:
            hidden_states = self.eh_proj(
                torch.cat(
                    (
                        self.enorm(hidden_states),
                        self.hnorm(forward_batch.spec_info.hidden_states),
                    ),
                    dim=-1,
                )
            )

        residual = None
        hidden_states, residual = self.decoder(
            positions, hidden_states, forward_batch, residual
        )

        if not forward_batch.forward_mode.is_idle():
            if residual is not None:
                hidden_states, _ = self.final_layernorm(hidden_states, residual)
            else:
                hidden_states = self.final_layernorm(hidden_states)

        return hidden_states
|
||||
|
||||
|
||||
class BailingMoeForCausalLMNextN(BailingMoEForCausalLM):
    """BailingMoE NextN draft-model entry point for speculative decoding."""

    def __init__(
        self,
        config: PretrainedConfig,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ) -> None:
        # Deliberately skip BailingMoEForCausalLM.__init__: this class
        # builds its own single-layer NextN model and LM head.
        nn.Module.__init__(self)
        self.config = config
        self.tp_size = get_tensor_model_parallel_world_size()
        self.quant_config = quant_config
        if hasattr(self, "determine_num_fused_shared_experts"):
            # Asystem has determine_num_fused_shared_experts but theta does not.
            self.determine_num_fused_shared_experts("BailingMoeForCausalLMNextN")

        self.model = BailingMoEModelNextN(
            config, quant_config, prefix=add_prefix("model", prefix)
        )
        # Checkpoint stores the head under "model.shared_head.head".
        self.lm_head = ParallelLMHead(
            config.vocab_size,
            config.hidden_size,
            quant_config=quant_config,
            prefix=add_prefix("model.shared_head.head", prefix),
            use_attn_tp_group=global_server_args_dict["enable_dp_lm_head"],
        )
        self.logits_processor = LogitsProcessor(config)

    @torch.no_grad()
    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        forward_batch: ForwardBatch,
    ) -> torch.Tensor:
        """Compute draft logits for one speculative step."""
        hidden_states = self.model(input_ids, positions, forward_batch)
        logits = self.logits_processor(
            input_ids, hidden_states, self.lm_head, forward_batch
        )
        return logits

    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
        """Delegate to the base loader in NextN mode."""
        super().load_weights(weights, is_nextn=True)
||||
|
||||
|
||||
EntryClass = [BailingMoeForCausalLMNextN]
|
||||
@@ -754,7 +754,12 @@ class ServerArgs:
|
||||
)
|
||||
|
||||
model_arch = self.get_hf_config().architectures[0]
|
||||
if model_arch in ["DeepseekV3ForCausalLM", "Glm4MoeForCausalLM"]:
|
||||
if model_arch in [
|
||||
"DeepseekV3ForCausalLM",
|
||||
"Glm4MoeForCausalLM",
|
||||
"BailingMoeV2ForCausalLM",
|
||||
"BailingMoeForCausalLM",
|
||||
]:
|
||||
# Auto set draft_model_path DeepSeek-V3/R1
|
||||
if self.speculative_draft_model_path is None:
|
||||
self.speculative_draft_model_path = self.model_path
|
||||
@@ -2724,6 +2729,8 @@ def auto_choose_speculative_params(self: ServerArgs):
|
||||
"DeepseekV3ForCausalLM",
|
||||
"DeepseekV2ForCausalLM",
|
||||
"GptOssForCausalLM",
|
||||
"BailingMoeForCausalLM",
|
||||
"BailingMoeV2ForCausalLM",
|
||||
]:
|
||||
# The default value for deepseek and gpt-oss
|
||||
return (3, 1, 4)
|
||||
|
||||
Reference in New Issue
Block a user