[Model] Support DeepSeek-V4
vllm_mlu/lora/layers/__init__.py (new file, 3 lines)
@@ -0,0 +1,3 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM-MLU project
vllm_mlu/lora/layers/base_linear.py (new file, 50 lines)
@@ -0,0 +1,50 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM-MLU project

import torch

from vllm.lora.layers.base_linear import BaseLinearLayerWithLoRA
from vllm.platforms import current_platform

from vllm_mlu.mlu_hijack_utils import MluHijackObject


def vllm__lora__layers__row_parallel_linear__BaseLinearLayerWithLoRA__apply(
    self,
    x: torch.Tensor,
    bias: torch.Tensor | None,
    residual: torch.Tensor | None = None,
) -> torch.Tensor:
    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: add residual in matmul
    '''
    output = self.base_layer.quant_method.apply(self.base_layer, x, bias, residual)
    '''
    ==================
    End of MLU Hijack
    ==================
    '''
    # In the transformers backend, x and output have an extra batch dimension
    # like (1, seq_len, hidden_dim), while punica expects (seq_len, hidden_dim),
    # so we need to flatten the batch dimensions.
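    # e.g. x (1, 128, 4096) and output (1, 128, 4096) become (128, 4096)
    # before the punica kernels run (shapes illustrative).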
    if x.ndim == 3 and output.ndim == 3:
        output = output.flatten(0, 1)
        x = x.flatten(0, 1)

    lora_output: torch.Tensor | None = self.punica_wrapper.add_lora_linear(
        output, x, self.lora_a_stacked, self.lora_b_stacked, 1.0, self.output_slices
    )
    if not current_platform.can_update_inplace():
        output = lora_output

    return output


MluHijackObject.apply_hijack(
    BaseLinearLayerWithLoRA,
    BaseLinearLayerWithLoRA.apply,
    vllm__lora__layers__row_parallel_linear__BaseLinearLayerWithLoRA__apply,
)
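For context, MluHijackObject.apply_hijack swaps the upstream method for the MLU variant at import time. Its implementation is not part of this diff, so the following is only a minimal sketch of how such a hijack utility typically works; the class name and bookkeeping here are assumptions for illustration:

# Hypothetical sketch; the real MluHijackObject lives in
# vllm_mlu.mlu_hijack_utils and may work differently.
class HijackRegistrySketch:
    _originals: list[tuple[type, str, object]] = []

    @classmethod
    def apply_hijack(cls, target_cls, orig_fn, new_fn):
        # Record the original so the patch could be rolled back later.
        cls._originals.append((target_cls, orig_fn.__name__, orig_fn))
        # Rebind the method on the class; existing instances pick up the
        # replacement immediately since lookup goes through the class.
        setattr(target_cls, orig_fn.__name__, new_fn)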
vllm_mlu/lora/layers/column_parallel_linear.py (new file, 39 lines)
@@ -0,0 +1,39 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM-MLU project

import torch

from vllm.lora.layers.column_parallel_linear import ColumnParallelLinearWithLoRA

from vllm_mlu.mlu_hijack_utils import MluHijackObject


vllm__lora__layers__column_parallel_linear__ColumnParallelLinearWithLoRA__forward_org = ColumnParallelLinearWithLoRA.forward


'''
=============================
Modify by vllm_mlu
=============================
@brief: add smooth_quant_scale and use_tp_weight parameters.
'''
def vllm__lora__layers__column_parallel_linear__ColumnParallelLinearWithLoRA__forward(
    self,
    input_,
    smooth_quant_scale: torch.Tensor | None = None,
    use_tp_weight: bool = False,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor | None]:
    assert not use_tp_weight, "LoRA does not support use_tp_weight yet."
    assert smooth_quant_scale is None, "LoRA does not support smooth quant yet."
    return vllm__lora__layers__column_parallel_linear__ColumnParallelLinearWithLoRA__forward_org(self, input_)
'''
==================
End of MLU Hijack
==================
'''


MluHijackObject.apply_hijack(
    ColumnParallelLinearWithLoRA,
    ColumnParallelLinearWithLoRA.forward,
    vllm__lora__layers__column_parallel_linear__ColumnParallelLinearWithLoRA__forward,
)
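A note on the `_org` alias captured at module import time: once apply_hijack rebinds ColumnParallelLinearWithLoRA.forward, looking the method up through the class again would hit the replacement and recurse forever, so the original must be saved first. A self-contained toy illustration of that pattern (names hypothetical):

# Capture-then-patch avoids infinite recursion when the wrapper
# delegates to the original method.
class Toy:
    def forward(self, x):
        return x + 1

_toy_forward_org = Toy.forward  # capture BEFORE patching

def patched_forward(self, x):
    # Delegate to the saved original; calling Toy.forward here would
    # invoke the patched method again and recurse forever.
    return _toy_forward_org(self, x) * 2

Toy.forward = patched_forward
assert Toy().forward(3) == 8  # (3 + 1) * 2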
vllm_mlu/lora/layers/row_parallel_linear.py (new file, 163 lines)
@@ -0,0 +1,163 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM-MLU project

import torch

from vllm.distributed import (
    split_tensor_along_last_dim,
    tensor_model_parallel_all_reduce,
)
from vllm.lora.layers.row_parallel_linear import (
    RowParallelLinearWithLoRA,
    RowParallelLinearWithShardedLoRA,
)
from vllm.platforms import current_platform

from vllm_mlu.mlu_hijack_utils import MluHijackObject


def vllm__lora__layers__row_parallel_linear__RowParallelLinearWithShardedLoRA__apply(
    self,
    x: torch.Tensor,
    bias: torch.Tensor | None = None,
    residual: torch.Tensor | None = None,
) -> torch.Tensor:
    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: add residual and bias in matmul
    '''
    output = self.base_layer.quant_method.apply(
        self.base_layer, x, bias, residual)
    '''
    ==================
    End of MLU Hijack
    ==================
    '''

    x = x.view(-1, x.shape[-1])
    output, out_orig_shape = output.view(-1, output.shape[-1]), output.shape
    buffer = torch.zeros(
        (self.n_slices, x.shape[0], self.lora_a_stacked[0].shape[2]),
        dtype=torch.float32,
        device=x.device,
    )
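    # Shape note (an assumption, inferred from the add_shrink call below):
    # buffer is (n_slices, num_tokens, max_lora_rank), kept in float32 so
    # the low-rank partial sums stay precise across the all_reduce.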

    shrunk_buffer: torch.Tensor | None = self.punica_wrapper.add_shrink(
        buffer, x, self.lora_a_stacked, 1.0
    )
    if not current_platform.can_update_inplace():
        buffer = shrunk_buffer
    if self.tp_size > 1:
        buffer = tensor_model_parallel_all_reduce(buffer)

    # Following S-LoRA, this allows fusing the all_gather and the all_reduce
    # by adding the column-partitioned LoRA output to a slice of the output
    # tensor, which is a partial sum due to row parallelism. All that remains
    # is a standard all_reduce. Be aware, though, that the output is not the
    # same as a normal row_parallel output; it should be reduced before use.
    # NOTE: offsets are based on the rank.
    shard_size = self.lora_b_stacked[0].shape[2]
    offset_start = self.tp_rank * shard_size
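    # e.g. with tp_size=4 and shard_size=128: rank 0 writes into columns
    # [0, 128), rank 2 into [256, 384), so the later all_reduce assembles
    # the full LoRA output (numbers illustrative).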
    lora_output: torch.Tensor | None = self.punica_wrapper.add_expand(
        output,
        buffer,
        self.lora_b_stacked,
        self.output_slices,
        offset_start=offset_start,
        add_input=True,
    )

    if not current_platform.can_update_inplace():
        output = lora_output

    output = output.view(*out_orig_shape)
    return output


def vllm__lora__layers__row_parallel_linear__RowParallelLinearWithLoRA__forward(
    self,
    input_: torch.Tensor,
    residual: torch.Tensor | None = None,
    smooth_quant_scale: torch.Tensor | None = None,
    use_tp_weight: bool = False,
    output: torch.Tensor | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor | None]:
    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: Add parameters `residual`, `smooth_quant_scale`, `use_tp_weight`
            and `output` to keep the signature consistent with
            RowParallelLinear.forward.
    '''
    assert (not use_tp_weight) and output is None, (
        "RowParallelLinearWithLoRA.forward does not support use_tp_weight=True"
        " or passing the output parameter.")
    '''
    ==================
    End of MLU Hijack
    ==================
    '''
    # Set up backprop all-reduce.
    if self.base_layer.input_is_parallel:
        input_parallel = input_
    else:
        # TODO: simplify code below
        splitted_input = split_tensor_along_last_dim(
            input_, num_partitions=self.base_layer.tp_size
        )
        input_parallel = splitted_input[self.tp_rank].contiguous()

    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: 1) apply residual fusion in the matmul like RowParallelLinear
            2) add bias in the matmul, not after the all_reduce
    '''
    # Matrix multiply.
    bias_ = (
        None if (self.base_layer.tp_rank > 0 or self.base_layer.skip_bias_add)
        else self.base_layer.bias
    )
    residual_ = None if self.base_layer.tp_rank > 0 else residual
    output_parallel = self.apply(input_parallel, bias_, residual_)
    '''
    ==================
    End of MLU Hijack
    ==================
    '''
    if self.base_layer.reduce_results and self.tp_size > 1:
        output = tensor_model_parallel_all_reduce(output_parallel)
    else:
        output = output_parallel
    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: do not add bias after the all_reduce
    '''
    output_bias = self.base_layer.bias if self.base_layer.skip_bias_add else None
    '''
    ==================
    End of MLU Hijack
    ==================
    '''

    if not self.base_layer.return_bias:
        return output
    return output, output_bias


MluHijackObject.apply_hijack(
    RowParallelLinearWithShardedLoRA,
    RowParallelLinearWithShardedLoRA.apply,
    vllm__lora__layers__row_parallel_linear__RowParallelLinearWithShardedLoRA__apply,
)
MluHijackObject.apply_hijack(
    RowParallelLinearWithLoRA,
    RowParallelLinearWithLoRA.forward,
    vllm__lora__layers__row_parallel_linear__RowParallelLinearWithLoRA__forward,
)
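The bias_/residual_ gating in the forward above is the key correctness detail: the all_reduce sums partial matmul outputs from every rank, so a bias or residual fused into the matmul on all ranks would be counted tp_size times. Restricting the fusion to rank 0 keeps the reduced sum correct. A minimal numeric sketch of that argument, using plain torch with toy shapes and a local sum standing in for all_reduce:

import torch

tp_size, n, k = 4, 2, 8
x = torch.randn(n, k)
w = torch.randn(k, n)
bias = torch.randn(n)

# Row-parallel: each rank holds a slice of the contraction dimension.
x_shards = x.chunk(tp_size, dim=-1)
w_shards = w.chunk(tp_size, dim=0)

# Fuse bias into the matmul on rank 0 only; other ranks pass bias=None.
partials = [
    xs @ ws + (bias if rank == 0 else 0)
    for rank, (xs, ws) in enumerate(zip(x_shards, w_shards))
]
reduced = torch.stack(partials).sum(0)  # stands in for all_reduce

torch.testing.assert_close(reduced, x @ w + bias)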