Further optimize multi-LoRA inference: LoRA-enabled performance achieves 80%+ of non-LoRA performance (#190)
* Optimize LoRA inference
  Signed-off-by: wanghao <wanghao@example.com>
* Further optimize multi-LoRA inference: LoRA-enabled performance achieves 80%+ of non-LoRA performance
  Signed-off-by: wanghao <wanghao@example.com>
---------
Signed-off-by: wanghao <wanghao@example.com>
Co-authored-by: wanghao <wanghao@example.com>
This commit is contained in:
@@ -22,16 +22,11 @@ Punica: Multi-Tenant LoRA Serving.
|
||||
https://arxiv.org/abs/2310.18547
|
||||
"""
|
||||
|
||||
from typing import TYPE_CHECKING, Optional, Union, final
|
||||
|
||||
import torch
|
||||
# Disable torchdynamo for all functions in this file
|
||||
torch._dynamo.config.disable = True
|
||||
|
||||
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
from typing import Callable, Optional, Tuple, Union
|
||||
|
||||
import torch
|
||||
from vllm.lora.punica_wrapper.punica_base import PunicaWrapperBase
|
||||
|
||||
from vllm_kunlun.lora.ops.kunlun_ops import (
|
||||
bgmv_expand,
|
||||
@@ -42,7 +37,7 @@ from vllm_kunlun.lora.ops.kunlun_ops import (
|
||||
sgmv_shrink,
|
||||
)
|
||||
|
||||
from vllm.lora.punica_wrapper.punica_base import PunicaWrapperBase
|
||||
# Disable torchdynamo for all functions in this file
|
||||
|
||||
|
||||
# The platforms that are compatible with the PyTorch-native implementation can
|
||||
@@ -545,4 +540,4 @@ class PunicaWrapperKunlun(PunicaWrapperBase):
|
||||
bgmv_shrink(x, lora_a_reshaped, buffer, indices, scale)
|
||||
bgmv_expand(buffer, lora_b_reshaped, y, indices, add_inputs=True)
|
||||
|
||||
y = y.view_as(y_org)
|
||||
y = y.view_as(y_org)
|
||||
|
||||
Reference in New Issue
Block a user