[Model] Support DeepSeek-V4
vllm_mlu/lora/layers/__init__.py (new file, 3 lines)
@@ -0,0 +1,3 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM-MLU project
vllm_mlu/lora/layers/base_linear.py (new file, 50 lines)
@@ -0,0 +1,50 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM-MLU project

import torch

from vllm.lora.layers.base_linear import BaseLinearLayerWithLoRA
from vllm.platforms import current_platform

from vllm_mlu.mlu_hijack_utils import MluHijackObject


def vllm__lora__layers__row_parallel_linear__BaseLinearLayerWithLoRA__apply(
    self,
    x: torch.Tensor,
    bias: torch.Tensor | None,
    residual: torch.Tensor | None = None,
) -> torch.Tensor:
    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: add residual in matmul
    '''
    output = self.base_layer.quant_method.apply(self.base_layer, x, bias, residual)
    '''
    ==================
    End of MLU Hijack
    ==================
    '''
    # In the transformers backend, x and output have an extra batch dimension
    # like (1, seq_len, hidden_dim), while punica expects (seq_len, hidden_dim),
    # so we need to flatten the batch dimensions.
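    # e.g. x (1, 128, 4096) and output (1, 128, 4096) become (128, 4096)
    # before the punica kernels run (shapes illustrative).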
    if x.ndim == 3 and output.ndim == 3:
        output = output.flatten(0, 1)
        x = x.flatten(0, 1)

    lora_output: torch.Tensor | None = self.punica_wrapper.add_lora_linear(
        output, x, self.lora_a_stacked, self.lora_b_stacked, 1.0, self.output_slices
    )
    if not current_platform.can_update_inplace():
        output = lora_output

    return output


MluHijackObject.apply_hijack(
    BaseLinearLayerWithLoRA,
    BaseLinearLayerWithLoRA.apply,
    vllm__lora__layers__row_parallel_linear__BaseLinearLayerWithLoRA__apply,
)
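For context, MluHijackObject.apply_hijack swaps the upstream method for the MLU variant at import time. Its implementation is not part of this diff, so the following is only a minimal sketch of how such a hijack utility typically works; the class name and bookkeeping here are assumptions for illustration:

# Hypothetical sketch; the real MluHijackObject lives in
# vllm_mlu.mlu_hijack_utils and may work differently.
class HijackRegistrySketch:
    _originals: list[tuple[type, str, object]] = []

    @classmethod
    def apply_hijack(cls, target_cls, orig_fn, new_fn):
        # Record the original so the patch could be rolled back later.
        cls._originals.append((target_cls, orig_fn.__name__, orig_fn))
        # Rebind the method on the class; existing instances pick up the
        # replacement immediately since lookup goes through the class.
        setattr(target_cls, orig_fn.__name__, new_fn)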
vllm_mlu/lora/layers/column_parallel_linear.py (new file, 39 lines)
@@ -0,0 +1,39 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM-MLU project

import torch

from vllm.lora.layers.column_parallel_linear import ColumnParallelLinearWithLoRA

from vllm_mlu.mlu_hijack_utils import MluHijackObject


vllm__lora__layers__column_parallel_linear__ColumnParallelLinearWithLoRA__forward_org = ColumnParallelLinearWithLoRA.forward


'''
=============================
Modify by vllm_mlu
=============================
@brief: add smooth_quant_scale and use_tp_weight parameters.
'''
def vllm__lora__layers__column_parallel_linear__ColumnParallelLinearWithLoRA__forward(
    self,
    input_,
    smooth_quant_scale: torch.Tensor | None = None,
    use_tp_weight: bool = False,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor | None]:
    assert not use_tp_weight, "LoRA does not support use_tp_weight yet."
    assert smooth_quant_scale is None, "LoRA does not support smooth quant yet."
    return vllm__lora__layers__column_parallel_linear__ColumnParallelLinearWithLoRA__forward_org(self, input_)
'''
==================
End of MLU Hijack
==================
'''


MluHijackObject.apply_hijack(
    ColumnParallelLinearWithLoRA,
    ColumnParallelLinearWithLoRA.forward,
    vllm__lora__layers__column_parallel_linear__ColumnParallelLinearWithLoRA__forward,
)
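A note on the `_org` alias captured at module import time: once apply_hijack rebinds ColumnParallelLinearWithLoRA.forward, looking the method up through the class again would hit the replacement and recurse forever, so the original must be saved first. A self-contained toy illustration of that pattern (names hypothetical):

# Capture-then-patch avoids infinite recursion when the wrapper
# delegates to the original method.
class Toy:
    def forward(self, x):
        return x + 1

_toy_forward_org = Toy.forward  # capture BEFORE patching

def patched_forward(self, x):
    # Delegate to the saved original; calling Toy.forward here would
    # invoke the patched method again and recurse forever.
    return _toy_forward_org(self, x) * 2

Toy.forward = patched_forward
assert Toy().forward(3) == 8  # (3 + 1) * 2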
vllm_mlu/lora/layers/row_parallel_linear.py (new file, 163 lines)
@@ -0,0 +1,163 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM-MLU project

import torch

from vllm.distributed import (
    split_tensor_along_last_dim,
    tensor_model_parallel_all_reduce,
)
from vllm.lora.layers.row_parallel_linear import (
    RowParallelLinearWithLoRA,
    RowParallelLinearWithShardedLoRA,
)
from vllm.platforms import current_platform

from vllm_mlu.mlu_hijack_utils import MluHijackObject


def vllm__lora__layers__row_parallel_linear__RowParallelLinearWithShardedLoRA__apply(
    self,
    x: torch.Tensor,
    bias: torch.Tensor | None = None,
    residual: torch.Tensor | None = None,
) -> torch.Tensor:
    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: add residual and bias in matmul
    '''
    output = self.base_layer.quant_method.apply(
        self.base_layer, x, bias, residual)
    '''
    ==================
    End of MLU Hijack
    ==================
    '''

    x = x.view(-1, x.shape[-1])
    output, out_orig_shape = output.view(-1, output.shape[-1]), output.shape
    buffer = torch.zeros(
        (self.n_slices, x.shape[0], self.lora_a_stacked[0].shape[2]),
        dtype=torch.float32,
        device=x.device,
    )
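    # Shape note (an assumption, inferred from the add_shrink call below):
    # buffer is (n_slices, num_tokens, max_lora_rank), kept in float32 so
    # the low-rank partial sums stay precise across the all_reduce.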

    shrunk_buffer: torch.Tensor | None = self.punica_wrapper.add_shrink(
        buffer, x, self.lora_a_stacked, 1.0
    )
    if not current_platform.can_update_inplace():
        buffer = shrunk_buffer
    if self.tp_size > 1:
        buffer = tensor_model_parallel_all_reduce(buffer)

    # Following S-LoRA, this allows fusing the all_gather and the all_reduce
    # by adding the column-partitioned LoRA output to a slice of the output
    # tensor, which is a partial sum due to row parallelism. All that remains
    # is a standard all_reduce. Be aware, though, that the output is not the
    # same as a normal row_parallel output; it should be reduced before use.
    # NOTE: offsets are based on the rank.
    shard_size = self.lora_b_stacked[0].shape[2]
    offset_start = self.tp_rank * shard_size
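    # e.g. with tp_size=4 and shard_size=128: rank 0 writes into columns
    # [0, 128), rank 2 into [256, 384), so the later all_reduce assembles
    # the full LoRA output (numbers illustrative).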
    lora_output: torch.Tensor | None = self.punica_wrapper.add_expand(
        output,
        buffer,
        self.lora_b_stacked,
        self.output_slices,
        offset_start=offset_start,
        add_input=True,
    )

    if not current_platform.can_update_inplace():
        output = lora_output

    output = output.view(*out_orig_shape)
    return output


def vllm__lora__layers__row_parallel_linear__RowParallelLinearWithLoRA__forward(
    self,
    input_: torch.Tensor,
    residual: torch.Tensor | None = None,
    smooth_quant_scale: torch.Tensor | None = None,
    use_tp_weight: bool = False,
    output: torch.Tensor | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor | None]:
    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: Add parameters `residual`, `smooth_quant_scale`, `use_tp_weight`
            and `output` to keep the signature consistent with
            RowParallelLinear.forward.
    '''
    assert (not use_tp_weight) and output is None, (
        "RowParallelLinearWithLoRA.forward does not support use_tp_weight=True"
        " or passing the output parameter.")
    '''
    ==================
    End of MLU Hijack
    ==================
    '''
    # Set up backprop all-reduce.
    if self.base_layer.input_is_parallel:
        input_parallel = input_
    else:
        # TODO: simplify code below
        splitted_input = split_tensor_along_last_dim(
            input_, num_partitions=self.base_layer.tp_size
        )
        input_parallel = splitted_input[self.tp_rank].contiguous()

    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: 1) apply residual fusion in the matmul like RowParallelLinear
            2) add bias in the matmul, not after the all_reduce
    '''
    # Matrix multiply.
    bias_ = (
        None if (self.base_layer.tp_rank > 0 or self.base_layer.skip_bias_add)
        else self.base_layer.bias
    )
    residual_ = None if self.base_layer.tp_rank > 0 else residual
    output_parallel = self.apply(input_parallel, bias_, residual_)
    '''
    ==================
    End of MLU Hijack
    ==================
    '''
    if self.base_layer.reduce_results and self.tp_size > 1:
        output = tensor_model_parallel_all_reduce(output_parallel)
    else:
        output = output_parallel
    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: do not add bias after the all_reduce
    '''
    output_bias = self.base_layer.bias if self.base_layer.skip_bias_add else None
    '''
    ==================
    End of MLU Hijack
    ==================
    '''

    if not self.base_layer.return_bias:
        return output
    return output, output_bias


MluHijackObject.apply_hijack(
    RowParallelLinearWithShardedLoRA,
    RowParallelLinearWithShardedLoRA.apply,
    vllm__lora__layers__row_parallel_linear__RowParallelLinearWithShardedLoRA__apply,
)
MluHijackObject.apply_hijack(
    RowParallelLinearWithLoRA,
    RowParallelLinearWithLoRA.forward,
    vllm__lora__layers__row_parallel_linear__RowParallelLinearWithLoRA__forward,
)
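The bias_/residual_ gating in the forward above is the key correctness detail: the all_reduce sums partial matmul outputs from every rank, so a bias or residual fused into the matmul on all ranks would be counted tp_size times. Restricting the fusion to rank 0 keeps the reduced sum correct. A minimal numeric sketch of that argument, using plain torch with toy shapes and a local sum standing in for all_reduce:

import torch

tp_size, n, k = 4, 2, 8
x = torch.randn(n, k)
w = torch.randn(k, n)
bias = torch.randn(n)

# Row-parallel: each rank holds a slice of the contraction dimension.
x_shards = x.chunk(tp_size, dim=-1)
w_shards = w.chunk(tp_size, dim=0)

# Fuse bias into the matmul on rank 0 only; other ranks pass bias=None.
partials = [
    xs @ ws + (bias if rank == 0 else 0)
    for rank, (xs, ws) in enumerate(zip(x_shards, w_shards))
]
reduced = torch.stack(partials).sum(0)  # stands in for all_reduce

torch.testing.assert_close(reduced, x @ w + bias)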