From 9caf6fbaf59c74a8600c75e6b9af36f8b0e8890b Mon Sep 17 00:00:00 2001
From: yupeng <507435917@qq.com>
Date: Fri, 26 Sep 2025 11:12:45 +0800
Subject: [PATCH] [Bugfix][LoRA] Fix LoRA bug after supporting Qwen3-Next
 (#3044)

### What this PR does / why we need it?
The LoRA e2e test uses the ilama-3.2-1B model, which uses the
transformers.py model files. Its self-attention layer names end with
"\*.attn", not "\*.self_attn".

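Some other models also have attention layer names that end with
"\*.attn", such as those in baichuan.py and bert.py.

This patch adds AscendQKVParallelLinearWithLoRA and
AscendMergedQKVParallelLinearWithLoRA, registers both in
refresh_all_lora_classes(), and re-enables the two ilama LoRA e2e tests
in the CI workflow.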

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
pytest -sv tests/e2e/singlecard/test_ilama_lora.py
pytest -sv tests/e2e/multicard/test_ilama_lora_tp2.py

- vLLM version: v0.10.2
- vLLM main:
https://github.com/vllm-project/vllm/commit/17b4c6685ce62d5652654784d6771a3d38e4273e

---------

Signed-off-by: paulyu12 <507435917@qq.com>
---
 .github/workflows/_e2e_test.yaml |  4 ++--
 vllm_ascend/lora/utils.py        | 33 ++++++++++++++++++++++++++++++++
 2 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/_e2e_test.yaml b/.github/workflows/_e2e_test.yaml
index 7f8df34..3a4f3df 100644
--- a/.github/workflows/_e2e_test.yaml
+++ b/.github/workflows/_e2e_test.yaml
@@ -92,7 +92,7 @@ jobs:
           pytest -sv tests/e2e/singlecard/test_chunked.py
           pytest -sv tests/e2e/singlecard/test_embedding.py
           pytest -sv tests/e2e/singlecard/test_guided_decoding.py
-          #pytest -sv tests/e2e/singlecard/test_ilama_lora.py
+          pytest -sv tests/e2e/singlecard/test_ilama_lora.py
           pytest -sv tests/e2e/singlecard/test_profile_execute_duration.py
           pytest -sv tests/e2e/singlecard/test_quantization.py
           pytest -sv tests/e2e/singlecard/test_sampler.py
@@ -174,7 +174,7 @@ jobs:
           # external_launcher test is not stable enough. Fix it later
           # pytest -sv tests/e2e/multicard/test_external_launcher.py
           pytest -sv tests/e2e/multicard/test_fused_moe_allgather_ep.py
-          #pytest -sv tests/e2e/multicard/test_ilama_lora_tp2.py
+          pytest -sv tests/e2e/multicard/test_ilama_lora_tp2.py
 
           # To avoid oom, we need to run the test in a single process.
           pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ
diff --git a/vllm_ascend/lora/utils.py b/vllm_ascend/lora/utils.py
index 47e95cd..be4fbeb 100644
--- a/vllm_ascend/lora/utils.py
+++ b/vllm_ascend/lora/utils.py
@@ -6,11 +6,15 @@ from transformers import PretrainedConfig
 from vllm.config import LoRAConfig
 from vllm.lora.layers import (ColumnParallelLinearWithLoRA,
                               MergedColumnParallelLinearWithLoRA,
+                              MergedQKVParallelLinearWithLoRA,
+                              QKVParallelLinearWithLoRA,
                               RowParallelLinearWithLoRA,
                               VocabParallelEmbeddingWithLoRA)
+from vllm.lora.layers.utils import _not_fully_sharded_can_replace
 
 from vllm_ascend.ops.linear import (AscendColumnParallelLinear,
                                     AscendMergedColumnParallelLinear,
+                                    AscendQKVParallelLinear,
                                     AscendRowParallelLinear)
 from vllm_ascend.ops.vocab_parallel_embedding import \
     AscendVocabParallelEmbedding
@@ -69,9 +73,38 @@ class AscendVocabParallelEmbeddingWithLoRA(VocabParallelEmbeddingWithLoRA):
         return type(source_layer) is AscendVocabParallelEmbedding
 
 
+class AscendQKVParallelLinearWithLoRA(QKVParallelLinearWithLoRA):
+
+    @classmethod
+    @_not_fully_sharded_can_replace
+    def can_replace_layer(cls, source_layer: nn.Module,
+                          lora_config: LoRAConfig, packed_modules_list: list,
+                          model_config: Optional[PretrainedConfig]) -> bool:
+        return type(source_layer) is AscendQKVParallelLinear and len(
+            packed_modules_list) == 1
+
+
+class AscendMergedQKVParallelLinearWithLoRA(MergedQKVParallelLinearWithLoRA):
+
+    @classmethod
+    @_not_fully_sharded_can_replace
+    def can_replace_layer(
+        cls,
+        source_layer: nn.Module,
+        lora_config: LoRAConfig,
+        packed_modules_list: list,
+        model_config: Optional[PretrainedConfig],
+    ) -> bool:
+        return (type(source_layer) is AscendQKVParallelLinear
+                and len(packed_modules_list) == 3)
+
+
 def refresh_all_lora_classes():
     vllm.lora.utils._all_lora_classes.add(AscendColumnParallelLinearWithLoRA)
     vllm.lora.utils._all_lora_classes.add(
         AscendMergedColumnParallelLinearWithLoRA)
     vllm.lora.utils._all_lora_classes.add(AscendRowParallelLinearWithLoRA)
     vllm.lora.utils._all_lora_classes.add(AscendVocabParallelEmbeddingWithLoRA)
+    vllm.lora.utils._all_lora_classes.add(AscendQKVParallelLinearWithLoRA)
+    vllm.lora.utils._all_lora_classes.add(
+        AscendMergedQKVParallelLinearWithLoRA)