# SPDX-License-Identifier: Apache-2.0
# Copyright 2023 The vLLM team.
# Copyright 2023 DeepSeek-AI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Adapted from
#   vllm-project/vllm/blob/main/vllm/model_executor/models/deepseek_v2.py
#   https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
"""Inference-only DeepseekV2/DeepseekV3 model."""

import os
from typing import Any, Dict, Optional, Union

import torch
import torch.distributed as dist
from torch import nn
from transformers import PretrainedConfig

from vllm.attention import Attention
from vllm.config import (CacheConfig, ModelConfig, VllmConfig,
                         get_current_vllm_config)
from vllm.distributed import (get_dp_group, get_pp_group,
                              get_tensor_model_parallel_world_size,
                              get_tp_group, tensor_model_parallel_all_reduce)
from vllm.forward_context import get_forward_context
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                               ReplicatedLinear,
                                               RowParallelLinear)
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.model_executor.layers.sampler import get_sampler
from vllm.model_executor.layers.vocab_parallel_embedding import (
    ParallelLMHead, VocabParallelEmbedding)
from vllm.model_executor.models.deepseek_v2 import \
    DeepseekV2ForCausalLM  # ruff: noqa: E501
from vllm.model_executor.models.deepseek_v2 import \
    yarn_get_mscale  # ruff: noqa: E501
from vllm.model_executor.models.deepseek_v2 import (
    DeepseekV2Attention, DeepseekV2DecoderLayer, DeepseekV2MLAAttention,
    DeepseekV2MLP)
from vllm.model_executor.models.utils import (
    PPMissingLayer, make_empty_intermediate_tensors_factory, make_layers,
    maybe_prefix)
from vllm.sequence import IntermediateTensors

from vllm_ascend.ops.fused_moe import AscendFusedMoE
from vllm_ascend.utils import VLLM_ENABLE_GRAPH_MODE


class CustomDeepseekV2MoE(nn.Module):

    top_k: int

    def __init__(
        self,
        config: PretrainedConfig,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ):
        super().__init__()
        self.tp_size = get_tensor_model_parallel_world_size()
        self.routed_scaling_factor = config.routed_scaling_factor
        self.n_shared_experts = config.n_shared_experts

        if self.tp_size > config.n_routed_experts:
            raise ValueError(
                f"Tensor parallel size {self.tp_size} is greater than "
                f"the number of experts {config.n_routed_experts}.")

        if config.hidden_act != "silu":
            raise ValueError(f"Unsupported activation: {config.hidden_act}. "
                             "Only silu is supported for now.")

        self.gate = ReplicatedLinear(config.hidden_size,
                                     config.n_routed_experts,
                                     bias=False,
                                     quant_config=None,
                                     prefix=f"{prefix}.gate")
        if config.topk_method == "noaux_tc":
            self.gate.e_score_correction_bias = nn.Parameter(
                torch.empty(config.n_routed_experts))
        else:
            self.gate.e_score_correction_bias = None

        self.experts = AscendFusedMoE(
            num_experts=config.n_routed_experts,
            top_k=config.num_experts_per_tok,
            hidden_size=config.hidden_size,
            intermediate_size=config.moe_intermediate_size,
            reduce_results=False,
            renormalize=config.norm_topk_prob,
            quant_config=quant_config,
            use_grouped_topk=True,
            num_expert_group=config.n_group,
            topk_group=config.topk_group,
            prefix=f"{prefix}.experts",
            scoring_func=config.scoring_func,
            e_score_correction_bias=self.gate.e_score_correction_bias)

        if config.n_shared_experts is not None:
            intermediate_size = (config.moe_intermediate_size *
                                 config.n_shared_experts)
            self.shared_experts = DeepseekV2MLP(
                hidden_size=config.hidden_size,
                intermediate_size=intermediate_size,
                hidden_act=config.hidden_act,
                quant_config=quant_config,
                reduce_results=True,
                prefix=f"{prefix}.shared_experts",
            )
        CustomDeepseekV2MoE.top_k = config.num_experts_per_tok

        vllm_config = get_current_vllm_config()
        self.dp_size = get_dp_group().world_size
        batch_size = vllm_config.scheduler_config.max_num_seqs

        self.enable_mc2 = int(os.environ.get("VLLM_ENABLE_MC2", 0)) == 1

        params_dtype = torch.get_default_dtype()
        self.final_hidden_states = torch.zeros(
            [batch_size, config.hidden_size],
            dtype=params_dtype,
            device="npu")
        self.tp_group = get_tp_group().device_group

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        attn_metadata = get_forward_context().attn_metadata
        if attn_metadata is None:
            # Profile run: no attention metadata is available yet.
            return hidden_states

        num_tokens, hidden_dim = hidden_states.shape
        hidden_states = hidden_states.view(-1, hidden_dim)

        shared_output = None
        if self.n_shared_experts is not None:
            shared_output = self.shared_experts(hidden_states)

        if (self.tp_size > 1 and self.enable_mc2
                and attn_metadata.num_prefills == 0):
            # hidden_states = dist._functional_collectives.reduce_scatter_tensor(
            #     hidden_states, "sum", scatter_dim=0, group=self.tp_group)
            chunks = torch.chunk(hidden_states,
                                 get_tp_group().world_size,
                                 dim=0)
            hidden_states = chunks[get_tp_group().rank_in_group]

        # router_logits: (num_tokens, n_experts)
        router_logits, _ = self.gate(hidden_states)

        is_prefill = attn_metadata.num_prefills > 0

        final_hidden_states = self.experts(
            hidden_states=hidden_states,
            router_logits=router_logits,
            is_prefill=is_prefill,
            top_k=CustomDeepseekV2MoE.top_k) * self.routed_scaling_factor

        if self.tp_size > 1:
            if self.enable_mc2 and not is_prefill:
                dist.all_gather_into_tensor(self.final_hidden_states,
                                            final_hidden_states,
                                            self.tp_group)
                final_hidden_states = self.final_hidden_states
            else:
                final_hidden_states = tensor_model_parallel_all_reduce(
                    final_hidden_states)
        if shared_output is not None:
            final_hidden_states = final_hidden_states + shared_output

        return final_hidden_states.view(num_tokens, hidden_dim)


class CustomDeepseekV2MLAAttention(DeepseekV2MLAAttention):

    def __init__(
        self,
        config: PretrainedConfig,
        hidden_size: int,
        num_heads: int,
        qk_nope_head_dim: int,
        qk_rope_head_dim: int,
        v_head_dim: int,
        q_lora_rank: Optional[int],
        kv_lora_rank: int,
        rope_theta: float = 10000,
        rope_scaling: Optional[Dict[str, Any]] = None,
        max_position_embeddings: int = 8192,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ) -> None:
        nn.Module.__init__(self)
        self.hidden_size = hidden_size
        self.qk_nope_head_dim = qk_nope_head_dim
        self.qk_rope_head_dim = qk_rope_head_dim
        self.qk_head_dim = qk_nope_head_dim + qk_rope_head_dim
        self.v_head_dim = v_head_dim

        self.q_lora_rank = q_lora_rank
        self.kv_lora_rank = kv_lora_rank

        self.num_heads = num_heads
        tp_size = get_tensor_model_parallel_world_size()
        assert num_heads % tp_size == 0
        self.num_local_heads = num_heads // tp_size

        self.scaling = self.qk_head_dim**-0.5
        self.rope_theta = rope_theta
        self.max_position_embeddings = max_position_embeddings

        if self.q_lora_rank is not None:
            self.q_a_proj = ReplicatedLinear(self.hidden_size,
                                             self.q_lora_rank,
                                             bias=False,
                                             quant_config=quant_config,
                                             prefix=f"{prefix}.q_a_proj")
            self.q_a_layernorm = RMSNorm(self.q_lora_rank,
                                         eps=config.rms_norm_eps)
            self.q_b_proj = ColumnParallelLinear(q_lora_rank,
                                                 self.num_heads *
                                                 self.qk_head_dim,
                                                 bias=False,
                                                 quant_config=quant_config,
                                                 prefix=f"{prefix}.q_b_proj")
        else:
            self.q_proj = ColumnParallelLinear(self.hidden_size,
                                               self.num_heads *
                                               self.qk_head_dim,
                                               bias=False,
                                               quant_config=quant_config,
                                               prefix=f"{prefix}.q_proj")

        self.kv_a_proj_with_mqa = ReplicatedLinear(
            self.hidden_size,
            self.kv_lora_rank + self.qk_rope_head_dim,
            bias=False,
            quant_config=quant_config,
            prefix=f"{prefix}.kv_a_proj_with_mqa")
        self.kv_a_layernorm = RMSNorm(self.kv_lora_rank,
                                      eps=config.rms_norm_eps)
        self.kv_b_proj = ColumnParallelLinear(
            self.kv_lora_rank,
            self.num_heads * (self.qk_nope_head_dim + self.v_head_dim),
            bias=False,
            quant_config=quant_config,
            prefix=f"{prefix}.kv_b_proj")
        self.o_proj = RowParallelLinear(self.num_heads * self.v_head_dim,
                                        self.hidden_size,
                                        bias=False,
                                        quant_config=quant_config,
                                        prefix=f"{prefix}.o_proj")

        if rope_scaling:
            rope_scaling["rope_type"] = 'deepseek_yarn'
        self.rotary_emb = get_rope(qk_rope_head_dim,
                                   rotary_dim=qk_rope_head_dim,
                                   max_position=max_position_embeddings,
                                   base=rope_theta,
                                   rope_scaling=rope_scaling,
                                   is_neox_style=False)
        if rope_scaling:
            mscale_all_dim = rope_scaling.get("mscale_all_dim", False)
            scaling_factor = rope_scaling["factor"]
            mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim))
            self.scaling = self.scaling * mscale * mscale

        # In the MLA backend, kv_cache includes both k_c and
        # pe (i.e. decoupled position embeddings). In particular,
        # the concat_and_cache_mla op requires
        #     k_c.size(1) + k_pe.size(1) == kv_cache.size(2)
        # i.e.
        #     kv_lora_rank + qk_rope_head_dim == head_size
        self.mla_attn = Attention(
            num_heads=self.num_local_heads,
            head_size=self.kv_lora_rank + self.qk_rope_head_dim,
            scale=self.scaling,
            num_kv_heads=1,
            cache_config=cache_config,
            quant_config=quant_config,
            prefix=f"{prefix}.attn",
            use_mla=True,
            # MLA Args
            q_lora_rank=self.q_lora_rank,
            kv_lora_rank=self.kv_lora_rank,
            qk_nope_head_dim=self.qk_nope_head_dim,
            qk_rope_head_dim=self.qk_rope_head_dim,
            qk_head_dim=self.qk_head_dim,
            v_head_dim=self.v_head_dim,
            rotary_emb=self.rotary_emb,
            q_proj=self.q_proj if self.q_lora_rank is None else self.q_b_proj,
            kv_a_proj_with_mqa=self.kv_a_proj_with_mqa,
            kv_a_layernorm=self.kv_a_layernorm,
            kv_b_proj=self.kv_b_proj,
            o_proj=self.o_proj,
        )

        self.prefix = prefix
        self.debug_layer_idx = int(self.prefix.split(".")[-2])

        # Graph (torchair) mode passes kv_cache and attn_metadata explicitly,
        # so the two execution paths use different forward signatures.
        if VLLM_ENABLE_GRAPH_MODE == "1":
            self.forward = self.forward_torchair
        else:
            self.forward = self.forward_eager  # type: ignore

    def forward_torchair(self,
                         positions: torch.Tensor,
                         hidden_states: torch.Tensor,
                         kv_cache: Optional[torch.Tensor] = None,
                         attn_metadata=None):
        if self.q_lora_rank is not None:
            ckq = self.q_a_proj(hidden_states)[0]
            hidden_states_or_q_c = self.q_a_layernorm(ckq)
        else:
            hidden_states_or_q_c = hidden_states
        return self.mla_attn(hidden_states_or_q_c, hidden_states, None,
                             kv_cache, attn_metadata)

    def forward_eager(self, positions: torch.Tensor,
                      hidden_states: torch.Tensor):
        if self.q_lora_rank is not None:
            ckq = self.q_a_proj(hidden_states)[0]
            hidden_states_or_q_c = self.q_a_layernorm(ckq)
        else:
            hidden_states_or_q_c = hidden_states
        kv_c, k_pe = self.kv_a_proj_with_mqa(hidden_states)[0].split(
            [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1)
        kv_c_normed = self.kv_a_layernorm(kv_c.contiguous())
        return self.mla_attn(hidden_states_or_q_c,
                             kv_c_normed,
                             k_pe,
                             output_shape=hidden_states.shape)


class CustomDeepseekV2DecoderLayer(DeepseekV2DecoderLayer):

    def __init__(
        self,
        config: PretrainedConfig,
        prefix: str,
        model_config: ModelConfig,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
    ) -> None:
        nn.Module.__init__(self)
        self.hidden_size = config.hidden_size
        rope_theta = getattr(config, "rope_theta", 10000)
        rope_scaling = getattr(config, "rope_scaling", None)
        max_position_embeddings = getattr(config, "max_position_embeddings",
                                          8192)
        # DecoderLayers are created with `make_layers` which passes the prefix
        # with the layer's index.
        layer_idx = int(prefix.split(sep='.')[-1])
        self.layer_idx = layer_idx
        # TODO: enable mla in vllm-ascend
        if model_config.use_mla:
            attn_cls = CustomDeepseekV2MLAAttention
        else:
            attn_cls = DeepseekV2Attention
        self.self_attn = attn_cls(
            config=config,
            hidden_size=self.hidden_size,
            num_heads=config.num_attention_heads,
            qk_nope_head_dim=config.qk_nope_head_dim,
            qk_rope_head_dim=config.qk_rope_head_dim,
            v_head_dim=config.v_head_dim,
            q_lora_rank=config.q_lora_rank
            if hasattr(config, "q_lora_rank") else None,
            kv_lora_rank=config.kv_lora_rank,
            rope_theta=rope_theta,
            rope_scaling=rope_scaling,
            max_position_embeddings=max_position_embeddings,
            cache_config=cache_config,
            quant_config=quant_config,
            prefix=f"{prefix}.self_attn",
        )

        if (config.n_routed_experts is not None
                and layer_idx >= config.first_k_dense_replace
                and layer_idx % config.moe_layer_freq == 0):
            self.mlp = CustomDeepseekV2MoE(
                config=config,
                quant_config=quant_config,
                prefix=f"{prefix}.mlp",
            )
        else:
            self.mlp = DeepseekV2MLP(
                hidden_size=config.hidden_size,
                intermediate_size=config.intermediate_size,
                hidden_act=config.hidden_act,
                quant_config=quant_config,
                prefix=f"{prefix}.mlp",
            )
        self.input_layernorm = RMSNorm(config.hidden_size,
                                       eps=config.rms_norm_eps)
        self.post_attention_layernorm = RMSNorm(config.hidden_size,
                                                eps=config.rms_norm_eps)
        self.routed_scaling_factor = config.routed_scaling_factor


class CustomDeepseekV2Model(nn.Module):

    fall_back_to_pt_during_load = False

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()

        config = vllm_config.model_config.hf_config
        model_config = vllm_config.model_config
        cache_config = vllm_config.cache_config
        quant_config = vllm_config.quant_config

        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        if get_pp_group().is_first_rank:
            self.embed_tokens = VocabParallelEmbedding(
                config.vocab_size,
                config.hidden_size,
                quant_config=quant_config,
                prefix=f"{prefix}.embed_tokens")
        else:
            self.embed_tokens = PPMissingLayer()

        self.start_layer, self.end_layer, self.layers = make_layers(
            config.num_hidden_layers,
            lambda prefix: CustomDeepseekV2DecoderLayer(
                config,
                prefix,
                model_config=model_config,
                cache_config=cache_config,
                quant_config=quant_config,
            ),
            prefix=f"{prefix}.layers")

        if get_pp_group().is_last_rank:
            self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        else:
            self.norm = PPMissingLayer()
        self.make_empty_intermediate_tensors = (
            make_empty_intermediate_tensors_factory(
                ["hidden_states", "residual"], config.hidden_size))

    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
        return self.embed_tokens(input_ids)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        intermediate_tensors: Optional[IntermediateTensors],
        inputs_embeds: Optional[torch.Tensor] = None,
    ) -> Union[torch.Tensor, IntermediateTensors]:
        if get_pp_group().is_first_rank:
            if inputs_embeds is not None:
                hidden_states = inputs_embeds
            else:
                hidden_states = self.get_input_embeddings(input_ids)
            residual = None
        else:
            assert intermediate_tensors is not None
            hidden_states = intermediate_tensors["hidden_states"]
            residual = intermediate_tensors["residual"]

        for layer in self.layers[self.start_layer:self.end_layer]:
            hidden_states, residual = layer(positions, hidden_states, residual)

        if not get_pp_group().is_last_rank:
            return IntermediateTensors({
                "hidden_states": hidden_states,
                "residual": residual
            })

        hidden_states, _ = self.norm(hidden_states, residual)
        return hidden_states


class CustomDeepseekV2ForCausalLM(DeepseekV2ForCausalLM):
    # Add `packed_modules_mapping` on top of `DeepseekV2ForCausalLM` to
    # support weight merging.
    packed_modules_mapping = {
        "gate_up_proj": ["gate_proj", "up_proj"],
        "experts":
        ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"]
    }

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        nn.Module.__init__(self)
        config = vllm_config.model_config.hf_config
        quant_config = vllm_config.quant_config
        self.config = config
        self.quant_config = quant_config
        self.model = CustomDeepseekV2Model(vllm_config=vllm_config,
                                           prefix=maybe_prefix(
                                               prefix, "model"))
        if get_pp_group().is_last_rank:
            self.lm_head = ParallelLMHead(config.vocab_size,
                                          config.hidden_size,
                                          quant_config=quant_config)
        else:
            self.lm_head = PPMissingLayer()
        self.logits_processor = LogitsProcessor(config.vocab_size)
        self.sampler = get_sampler()
        self.make_empty_intermediate_tensors = (
            self.model.make_empty_intermediate_tensors)


class CustomDeepseekV3ForCausalLM(CustomDeepseekV2ForCausalLM):
    pass
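
# A minimal usage sketch (an assumption, not part of this module): out-of-tree
# models such as these are typically exposed to vLLM through `ModelRegistry`,
# e.g. from the vllm-ascend plugin's registration hook. The module path below
# is hypothetical and shown only to illustrate how the custom classes would
# replace the built-in DeepSeek implementations.
#
#     from vllm import ModelRegistry
#
#     ModelRegistry.register_model(
#         "DeepseekV2ForCausalLM",
#         "vllm_ascend.models.deepseek_v2:CustomDeepseekV2ForCausalLM")
#     ModelRegistry.register_model(
#         "DeepseekV3ForCausalLM",
#         "vllm_ascend.models.deepseek_v2:CustomDeepseekV3ForCausalLM")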