Further optimize multi-LoRA inference; LoRA-enabled performance achieves 80%+ of non-LoRA performance (#190)

* optimize LoRA inference Signed-off-by: wanghao <wanghao@example.com> * further optimize multi-LoRA inference; LoRA-enabled performance achieves 80%+ of non-LoRA performance Signed-off-by: wanghao <wanghao@example.com> --------- Signed-off-by: wanghao <wanghao@example.com> Co-authored-by: wanghao <wanghao@example.com>
2026-02-11 12:04:14 +08:00
parent 9b1f25fbe3
commit bd8c999335
3 changed files with 601 additions and 127 deletions
--- a/vllm_kunlun/lora/punica_wrapper/punica_kunlun.py
+++ b/vllm_kunlun/lora/punica_wrapper/punica_kunlun.py
@@ -22,16 +22,11 @@ Punica: Multi-Tenant LoRA Serving.
 https://arxiv.org/abs/2310.18547
 """

-from typing import TYPE_CHECKING, Optional, Union, final
-
-import torch
-# Disable torchdynamo for all functions in this file
-torch._dynamo.config.disable = True
-
-
 # SPDX-License-Identifier: Apache-2.0
 from typing import Callable, Optional, Tuple, Union

+import torch
+from vllm.lora.punica_wrapper.punica_base import PunicaWrapperBase

 from vllm_kunlun.lora.ops.kunlun_ops import (
    bgmv_expand,
@@ -42,7 +37,7 @@ from vllm_kunlun.lora.ops.kunlun_ops import (
    sgmv_shrink,
 )

-from vllm.lora.punica_wrapper.punica_base import PunicaWrapperBase
+# Disable torchdynamo for all functions in this file


 # The platforms that are compatible with the PyTorch-native implementation can
@@ -545,4 +540,4 @@ class PunicaWrapperKunlun(PunicaWrapperBase):
        bgmv_shrink(x, lora_a_reshaped, buffer, indices, scale)
        bgmv_expand(buffer, lora_b_reshaped, y, indices, add_inputs=True)

-        y = y.view_as(y_org)
+        y = y.view_as(y_org)