xc-llm-ascend/vllm_ascend/lora/utils.py

from typing import Optional

import vllm
from torch import nn
from transformers import PretrainedConfig
from vllm.config import LoRAConfig
from vllm.lora.layers import (ColumnParallelLinearWithLoRA,
                              MergedColumnParallelLinearWithLoRA,
                              RowParallelLinearWithLoRA,
                              VocabParallelEmbeddingWithLoRA)

from vllm_ascend.ops.linear import (AscendColumnParallelLinear,
                                    AscendMergedColumnParallelLinear,
                                    AscendRowParallelLinear)
from vllm_ascend.ops.vocab_parallel_embedding import \
    AscendVocabParallelEmbedding


class AscendColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
    """``ColumnParallelLinearWithLoRA`` variant for the Ascend linear layer.

    Only ``can_replace_layer`` is overridden, so that this class is selected
    for ``AscendColumnParallelLinear`` modules.
    """

    @classmethod
    def can_replace_layer(
        cls,
        source_layer: nn.Module,
        lora_config: LoRAConfig,
        packed_modules_list: list,
        model_config: Optional[PretrainedConfig],
    ) -> bool:
        """Tell whether ``source_layer`` can be replaced by this class.

        The decision is based solely on the concrete type of
        ``source_layer``: it must be exactly ``AscendColumnParallelLinear``
        (subclass instances do not match). ``lora_config``,
        ``packed_modules_list`` and ``model_config`` are accepted but not
        consulted.

        Returns:
            ``True`` on an exact type match, ``False`` otherwise.
        """
        layer_type = type(source_layer)
        is_exact_match = layer_type is AscendColumnParallelLinear
        return is_exact_match


class AscendMergedColumnParallelLinearWithLoRA(
        MergedColumnParallelLinearWithLoRA):
    """``MergedColumnParallelLinearWithLoRA`` variant for the Ascend layer.

    Only ``can_replace_layer`` is overridden, so that this class is selected
    for ``AscendMergedColumnParallelLinear`` modules.
    """

    @classmethod
    def can_replace_layer(
        cls,
        source_layer: nn.Module,
        lora_config: LoRAConfig,
        packed_modules_list: list,
        model_config: Optional[PretrainedConfig],
    ) -> bool:
        """Tell whether ``source_layer`` can be replaced by this class.

        The decision is based solely on the concrete type of
        ``source_layer``: it must be exactly
        ``AscendMergedColumnParallelLinear`` (subclass instances do not
        match). ``lora_config``, ``packed_modules_list`` and
        ``model_config`` are accepted but not consulted.

        Returns:
            ``True`` on an exact type match, ``False`` otherwise.
        """
        layer_type = type(source_layer)
        is_exact_match = layer_type is AscendMergedColumnParallelLinear
        return is_exact_match


class AscendRowParallelLinearWithLoRA(RowParallelLinearWithLoRA):
    """``RowParallelLinearWithLoRA`` variant for the Ascend linear layer.

    Only ``can_replace_layer`` is overridden, so that this class is selected
    for ``AscendRowParallelLinear`` modules.
    """

    @classmethod
    def can_replace_layer(
        cls,
        source_layer: nn.Module,
        lora_config: LoRAConfig,
        packed_modules_list: list,
        model_config: Optional[PretrainedConfig],
    ) -> bool:
        """Tell whether ``source_layer`` can be replaced by this class.

        The decision is based solely on the concrete type of
        ``source_layer``: it must be exactly ``AscendRowParallelLinear``
        (subclass instances do not match). ``lora_config``,
        ``packed_modules_list`` and ``model_config`` are accepted but not
        consulted.

        Returns:
            ``True`` on an exact type match, ``False`` otherwise.
        """
        layer_type = type(source_layer)
        is_exact_match = layer_type is AscendRowParallelLinear
        return is_exact_match


class AscendVocabParallelEmbeddingWithLoRA(VocabParallelEmbeddingWithLoRA):
    """``VocabParallelEmbeddingWithLoRA`` variant for the Ascend embedding.

    Only ``can_replace_layer`` is overridden, so that this class is selected
    for ``AscendVocabParallelEmbedding`` modules.
    """

    @classmethod
    def can_replace_layer(
        cls,
        source_layer: nn.Module,
        lora_config: LoRAConfig,
        packed_modules_list: list,
        model_config: Optional[PretrainedConfig],
    ) -> bool:
        """Tell whether ``source_layer`` can be replaced by this class.

        The decision is based solely on the concrete type of
        ``source_layer``: it must be exactly ``AscendVocabParallelEmbedding``
        (subclass instances do not match). ``lora_config``,
        ``packed_modules_list`` and ``model_config`` are accepted but not
        consulted.

        Returns:
            ``True`` on an exact type match, ``False`` otherwise.
        """
        layer_type = type(source_layer)
        is_exact_match = layer_type is AscendVocabParallelEmbedding
        return is_exact_match


def refresh_all_lora_classes():
    """Add the Ascend ``*WithLoRA`` classes to vLLM's LoRA class collection.

    Each of the four Ascend LoRA layer classes is added, in order, to
    ``vllm.lora.utils._all_lora_classes``. The function returns ``None``;
    its only effect is the mutation of that module-level collection.
    """
    registry = vllm.lora.utils._all_lora_classes
    ascend_lora_classes = (
        AscendColumnParallelLinearWithLoRA,
        AscendMergedColumnParallelLinearWithLoRA,
        AscendRowParallelLinearWithLoRA,
        AscendVocabParallelEmbeddingWithLoRA,
    )
    for lora_cls in ascend_lora_classes:
        registry.add(lora_cls)
[feat]: oproj tensor parallelism in pure DP and graph-mode scenarios. (#2167) ### What this PR does / why we need it? This PR introduces Oproj matrix tensor model parallel to achieve decreasing of memory consumption. It only support graph mode in pure DP scenario. In deepseek r1 w8a8 PD disagregated Decode instance, using pure DP, with oproj_tensor_parallel_size = 8, we have 1 ms TPOT increasing, saved 5.8 GB NPU memory per RANK. We got best performance when oproj_tensor_parallel_size=4 without TPOT increasing. performance data: <img width="1442" height="442" alt="image" src="https://github.com/user-attachments/assets/83270fc5-868a-4387-b0a9-fac29b4a376d" /> ### Does this PR introduce _any_ user-facing change? This PR introduces one new config in `additional_config`. \| Name \| Effect \| Required \| Type \| Constraints \| \| :---------------------------- \| :--------------------------------------- \| :------- \| :--- \| :----------------- \| \| oproj_tensor_parallel_size \| Split the o_proj matrix along the row dimension (head num * head dim) into oproj_tensor_parallel_size pieces. \| No \| int \| default value is None, once this value is set, the feature will be enabled, head num * head dim must be divisible by this value. \| example `--additional_config={"oproj_tensor_parallel_size": 8}` ### How was this patch tested? - vLLM version: v0.10.1.1 - vLLM main: https://github.com/vllm-project/vllm/commit/eddaafc1c77b0690194cbd1b73747d572793838c --------- Signed-off-by: zzhx1 <zzh_201018@outlook.com> Co-authored-by: zzh <zzh_201018@outlook.com> 2025-09-07 10:31:32 +08:00			`from typing import Optional`

[Misc] Move lora patch file into lora module (#2797) Cleanup useless file in patch module. Update the lora support list is OK in vLLM Ascend, no need to patch vLLM - vLLM version: v0.10.1.1 - vLLM main: https://github.com/vllm-project/vllm/commit/f4962a6d55a340ebb569d377c842deff7611d8f7 Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com> 2025-09-08 21:42:12 +08:00			`import vllm`
[feat]: oproj tensor parallelism in pure DP and graph-mode scenarios. (#2167) ### What this PR does / why we need it? This PR introduces Oproj matrix tensor model parallel to achieve decreasing of memory consumption. It only support graph mode in pure DP scenario. In deepseek r1 w8a8 PD disagregated Decode instance, using pure DP, with oproj_tensor_parallel_size = 8, we have 1 ms TPOT increasing, saved 5.8 GB NPU memory per RANK. We got best performance when oproj_tensor_parallel_size=4 without TPOT increasing. performance data: <img width="1442" height="442" alt="image" src="https://github.com/user-attachments/assets/83270fc5-868a-4387-b0a9-fac29b4a376d" /> ### Does this PR introduce _any_ user-facing change? This PR introduces one new config in `additional_config`. \| Name \| Effect \| Required \| Type \| Constraints \| \| :---------------------------- \| :--------------------------------------- \| :------- \| :--- \| :----------------- \| \| oproj_tensor_parallel_size \| Split the o_proj matrix along the row dimension (head num * head dim) into oproj_tensor_parallel_size pieces. \| No \| int \| default value is None, once this value is set, the feature will be enabled, head num * head dim must be divisible by this value. \| example `--additional_config={"oproj_tensor_parallel_size": 8}` ### How was this patch tested? - vLLM version: v0.10.1.1 - vLLM main: https://github.com/vllm-project/vllm/commit/eddaafc1c77b0690194cbd1b73747d572793838c --------- Signed-off-by: zzhx1 <zzh_201018@outlook.com> Co-authored-by: zzh <zzh_201018@outlook.com> 2025-09-07 10:31:32 +08:00			`from torch import nn`
			`from transformers import PretrainedConfig`
			`from vllm.config import LoRAConfig`
			`from vllm.lora.layers import (ColumnParallelLinearWithLoRA,`
			`MergedColumnParallelLinearWithLoRA,`
[Misc] Move lora patch file into lora module (#2797) Cleanup useless file in patch module. Update the lora support list is OK in vLLM Ascend, no need to patch vLLM - vLLM version: v0.10.1.1 - vLLM main: https://github.com/vllm-project/vllm/commit/f4962a6d55a340ebb569d377c842deff7611d8f7 Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com> 2025-09-08 21:42:12 +08:00			`RowParallelLinearWithLoRA,`
			`VocabParallelEmbeddingWithLoRA)`
[feat]: oproj tensor parallelism in pure DP and graph-mode scenarios. (#2167) ### What this PR does / why we need it? This PR introduces Oproj matrix tensor model parallel to achieve decreasing of memory consumption. It only support graph mode in pure DP scenario. In deepseek r1 w8a8 PD disagregated Decode instance, using pure DP, with oproj_tensor_parallel_size = 8, we have 1 ms TPOT increasing, saved 5.8 GB NPU memory per RANK. We got best performance when oproj_tensor_parallel_size=4 without TPOT increasing. performance data: <img width="1442" height="442" alt="image" src="https://github.com/user-attachments/assets/83270fc5-868a-4387-b0a9-fac29b4a376d" /> ### Does this PR introduce _any_ user-facing change? This PR introduces one new config in `additional_config`. \| Name \| Effect \| Required \| Type \| Constraints \| \| :---------------------------- \| :--------------------------------------- \| :------- \| :--- \| :----------------- \| \| oproj_tensor_parallel_size \| Split the o_proj matrix along the row dimension (head num * head dim) into oproj_tensor_parallel_size pieces. \| No \| int \| default value is None, once this value is set, the feature will be enabled, head num * head dim must be divisible by this value. \| example `--additional_config={"oproj_tensor_parallel_size": 8}` ### How was this patch tested? - vLLM version: v0.10.1.1 - vLLM main: https://github.com/vllm-project/vllm/commit/eddaafc1c77b0690194cbd1b73747d572793838c --------- Signed-off-by: zzhx1 <zzh_201018@outlook.com> Co-authored-by: zzh <zzh_201018@outlook.com> 2025-09-07 10:31:32 +08:00
			`from vllm_ascend.ops.linear import (AscendColumnParallelLinear,`
			`AscendMergedColumnParallelLinear,`
			`AscendRowParallelLinear)`
[Misc] Move lora patch file into lora module (#2797) Cleanup useless file in patch module. Update the lora support list is OK in vLLM Ascend, no need to patch vLLM - vLLM version: v0.10.1.1 - vLLM main: https://github.com/vllm-project/vllm/commit/f4962a6d55a340ebb569d377c842deff7611d8f7 Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com> 2025-09-08 21:42:12 +08:00			`from vllm_ascend.ops.vocab_parallel_embedding import \`
			`AscendVocabParallelEmbedding`
[feat]: oproj tensor parallelism in pure DP and graph-mode scenarios. (#2167) ### What this PR does / why we need it? This PR introduces Oproj matrix tensor model parallel to achieve decreasing of memory consumption. It only support graph mode in pure DP scenario. In deepseek r1 w8a8 PD disagregated Decode instance, using pure DP, with oproj_tensor_parallel_size = 8, we have 1 ms TPOT increasing, saved 5.8 GB NPU memory per RANK. We got best performance when oproj_tensor_parallel_size=4 without TPOT increasing. performance data: <img width="1442" height="442" alt="image" src="https://github.com/user-attachments/assets/83270fc5-868a-4387-b0a9-fac29b4a376d" /> ### Does this PR introduce _any_ user-facing change? This PR introduces one new config in `additional_config`. \| Name \| Effect \| Required \| Type \| Constraints \| \| :---------------------------- \| :--------------------------------------- \| :------- \| :--- \| :----------------- \| \| oproj_tensor_parallel_size \| Split the o_proj matrix along the row dimension (head num * head dim) into oproj_tensor_parallel_size pieces. \| No \| int \| default value is None, once this value is set, the feature will be enabled, head num * head dim must be divisible by this value. \| example `--additional_config={"oproj_tensor_parallel_size": 8}` ### How was this patch tested? - vLLM version: v0.10.1.1 - vLLM main: https://github.com/vllm-project/vllm/commit/eddaafc1c77b0690194cbd1b73747d572793838c --------- Signed-off-by: zzhx1 <zzh_201018@outlook.com> Co-authored-by: zzh <zzh_201018@outlook.com> 2025-09-07 10:31:32 +08:00

[Misc] Move lora patch file into lora module (#2797) Cleanup useless file in patch module. Update the lora support list is OK in vLLM Ascend, no need to patch vLLM - vLLM version: v0.10.1.1 - vLLM main: https://github.com/vllm-project/vllm/commit/f4962a6d55a340ebb569d377c842deff7611d8f7 Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com> 2025-09-08 21:42:12 +08:00			`class AscendColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):`
[feat]: oproj tensor parallelism in pure DP and graph-mode scenarios. (#2167) ### What this PR does / why we need it? This PR introduces Oproj matrix tensor model parallel to achieve decreasing of memory consumption. It only support graph mode in pure DP scenario. In deepseek r1 w8a8 PD disagregated Decode instance, using pure DP, with oproj_tensor_parallel_size = 8, we have 1 ms TPOT increasing, saved 5.8 GB NPU memory per RANK. We got best performance when oproj_tensor_parallel_size=4 without TPOT increasing. performance data: <img width="1442" height="442" alt="image" src="https://github.com/user-attachments/assets/83270fc5-868a-4387-b0a9-fac29b4a376d" /> ### Does this PR introduce _any_ user-facing change? This PR introduces one new config in `additional_config`. \| Name \| Effect \| Required \| Type \| Constraints \| \| :---------------------------- \| :--------------------------------------- \| :------- \| :--- \| :----------------- \| \| oproj_tensor_parallel_size \| Split the o_proj matrix along the row dimension (head num * head dim) into oproj_tensor_parallel_size pieces. \| No \| int \| default value is None, once this value is set, the feature will be enabled, head num * head dim must be divisible by this value. \| example `--additional_config={"oproj_tensor_parallel_size": 8}` ### How was this patch tested? - vLLM version: v0.10.1.1 - vLLM main: https://github.com/vllm-project/vllm/commit/eddaafc1c77b0690194cbd1b73747d572793838c --------- Signed-off-by: zzhx1 <zzh_201018@outlook.com> Co-authored-by: zzh <zzh_201018@outlook.com> 2025-09-07 10:31:32 +08:00
			`@classmethod`
			`def can_replace_layer(`
			`cls,`
			`source_layer: nn.Module,`
			`lora_config: LoRAConfig,`
			`packed_modules_list: list,`
			`model_config: Optional[PretrainedConfig],`
			`) -> bool:`
[Misc] Move lora patch file into lora module (#2797) Cleanup useless file in patch module. Update the lora support list is OK in vLLM Ascend, no need to patch vLLM - vLLM version: v0.10.1.1 - vLLM main: https://github.com/vllm-project/vllm/commit/f4962a6d55a340ebb569d377c842deff7611d8f7 Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com> 2025-09-08 21:42:12 +08:00			`return type(source_layer) is AscendColumnParallelLinear`
[feat]: oproj tensor parallelism in pure DP and graph-mode scenarios. (#2167) ### What this PR does / why we need it? This PR introduces Oproj matrix tensor model parallel to achieve decreasing of memory consumption. It only support graph mode in pure DP scenario. In deepseek r1 w8a8 PD disagregated Decode instance, using pure DP, with oproj_tensor_parallel_size = 8, we have 1 ms TPOT increasing, saved 5.8 GB NPU memory per RANK. We got best performance when oproj_tensor_parallel_size=4 without TPOT increasing. performance data: <img width="1442" height="442" alt="image" src="https://github.com/user-attachments/assets/83270fc5-868a-4387-b0a9-fac29b4a376d" /> ### Does this PR introduce _any_ user-facing change? This PR introduces one new config in `additional_config`. \| Name \| Effect \| Required \| Type \| Constraints \| \| :---------------------------- \| :--------------------------------------- \| :------- \| :--- \| :----------------- \| \| oproj_tensor_parallel_size \| Split the o_proj matrix along the row dimension (head num * head dim) into oproj_tensor_parallel_size pieces. \| No \| int \| default value is None, once this value is set, the feature will be enabled, head num * head dim must be divisible by this value. \| example `--additional_config={"oproj_tensor_parallel_size": 8}` ### How was this patch tested? - vLLM version: v0.10.1.1 - vLLM main: https://github.com/vllm-project/vllm/commit/eddaafc1c77b0690194cbd1b73747d572793838c --------- Signed-off-by: zzhx1 <zzh_201018@outlook.com> Co-authored-by: zzh <zzh_201018@outlook.com> 2025-09-07 10:31:32 +08:00

[Misc] Move lora patch file into lora module (#2797) Cleanup useless file in patch module. Update the lora support list is OK in vLLM Ascend, no need to patch vLLM - vLLM version: v0.10.1.1 - vLLM main: https://github.com/vllm-project/vllm/commit/f4962a6d55a340ebb569d377c842deff7611d8f7 Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com> 2025-09-08 21:42:12 +08:00			`class AscendMergedColumnParallelLinearWithLoRA(`
			`MergedColumnParallelLinearWithLoRA):`
[feat]: oproj tensor parallelism in pure DP and graph-mode scenarios. (#2167) ### What this PR does / why we need it? This PR introduces Oproj matrix tensor model parallel to achieve decreasing of memory consumption. It only support graph mode in pure DP scenario. In deepseek r1 w8a8 PD disagregated Decode instance, using pure DP, with oproj_tensor_parallel_size = 8, we have 1 ms TPOT increasing, saved 5.8 GB NPU memory per RANK. We got best performance when oproj_tensor_parallel_size=4 without TPOT increasing. performance data: <img width="1442" height="442" alt="image" src="https://github.com/user-attachments/assets/83270fc5-868a-4387-b0a9-fac29b4a376d" /> ### Does this PR introduce _any_ user-facing change? This PR introduces one new config in `additional_config`. \| Name \| Effect \| Required \| Type \| Constraints \| \| :---------------------------- \| :--------------------------------------- \| :------- \| :--- \| :----------------- \| \| oproj_tensor_parallel_size \| Split the o_proj matrix along the row dimension (head num * head dim) into oproj_tensor_parallel_size pieces. \| No \| int \| default value is None, once this value is set, the feature will be enabled, head num * head dim must be divisible by this value. \| example `--additional_config={"oproj_tensor_parallel_size": 8}` ### How was this patch tested? - vLLM version: v0.10.1.1 - vLLM main: https://github.com/vllm-project/vllm/commit/eddaafc1c77b0690194cbd1b73747d572793838c --------- Signed-off-by: zzhx1 <zzh_201018@outlook.com> Co-authored-by: zzh <zzh_201018@outlook.com> 2025-09-07 10:31:32 +08:00
			`@classmethod`
			`def can_replace_layer(`
			`cls,`
			`source_layer: nn.Module,`
			`lora_config: LoRAConfig,`
			`packed_modules_list: list,`
			`model_config: Optional[PretrainedConfig],`
			`) -> bool:`
[Misc] Move lora patch file into lora module (#2797) Cleanup useless file in patch module. Update the lora support list is OK in vLLM Ascend, no need to patch vLLM - vLLM version: v0.10.1.1 - vLLM main: https://github.com/vllm-project/vllm/commit/f4962a6d55a340ebb569d377c842deff7611d8f7 Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com> 2025-09-08 21:42:12 +08:00			`return type(source_layer) is AscendMergedColumnParallelLinear`
[feat]: oproj tensor parallelism in pure DP and graph-mode scenarios. (#2167) ### What this PR does / why we need it? This PR introduces Oproj matrix tensor model parallel to achieve decreasing of memory consumption. It only support graph mode in pure DP scenario. In deepseek r1 w8a8 PD disagregated Decode instance, using pure DP, with oproj_tensor_parallel_size = 8, we have 1 ms TPOT increasing, saved 5.8 GB NPU memory per RANK. We got best performance when oproj_tensor_parallel_size=4 without TPOT increasing. performance data: <img width="1442" height="442" alt="image" src="https://github.com/user-attachments/assets/83270fc5-868a-4387-b0a9-fac29b4a376d" /> ### Does this PR introduce _any_ user-facing change? This PR introduces one new config in `additional_config`. \| Name \| Effect \| Required \| Type \| Constraints \| \| :---------------------------- \| :--------------------------------------- \| :------- \| :--- \| :----------------- \| \| oproj_tensor_parallel_size \| Split the o_proj matrix along the row dimension (head num * head dim) into oproj_tensor_parallel_size pieces. \| No \| int \| default value is None, once this value is set, the feature will be enabled, head num * head dim must be divisible by this value. \| example `--additional_config={"oproj_tensor_parallel_size": 8}` ### How was this patch tested? - vLLM version: v0.10.1.1 - vLLM main: https://github.com/vllm-project/vllm/commit/eddaafc1c77b0690194cbd1b73747d572793838c --------- Signed-off-by: zzhx1 <zzh_201018@outlook.com> Co-authored-by: zzh <zzh_201018@outlook.com> 2025-09-07 10:31:32 +08:00

[Misc] Move lora patch file into lora module (#2797) Cleanup useless file in patch module. Update the lora support list is OK in vLLM Ascend, no need to patch vLLM - vLLM version: v0.10.1.1 - vLLM main: https://github.com/vllm-project/vllm/commit/f4962a6d55a340ebb569d377c842deff7611d8f7 Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com> 2025-09-08 21:42:12 +08:00			`class AscendRowParallelLinearWithLoRA(RowParallelLinearWithLoRA):`
[feat]: oproj tensor parallelism in pure DP and graph-mode scenarios. (#2167) ### What this PR does / why we need it? This PR introduces Oproj matrix tensor model parallel to achieve decreasing of memory consumption. It only support graph mode in pure DP scenario. In deepseek r1 w8a8 PD disagregated Decode instance, using pure DP, with oproj_tensor_parallel_size = 8, we have 1 ms TPOT increasing, saved 5.8 GB NPU memory per RANK. We got best performance when oproj_tensor_parallel_size=4 without TPOT increasing. performance data: <img width="1442" height="442" alt="image" src="https://github.com/user-attachments/assets/83270fc5-868a-4387-b0a9-fac29b4a376d" /> ### Does this PR introduce _any_ user-facing change? This PR introduces one new config in `additional_config`. \| Name \| Effect \| Required \| Type \| Constraints \| \| :---------------------------- \| :--------------------------------------- \| :------- \| :--- \| :----------------- \| \| oproj_tensor_parallel_size \| Split the o_proj matrix along the row dimension (head num * head dim) into oproj_tensor_parallel_size pieces. \| No \| int \| default value is None, once this value is set, the feature will be enabled, head num * head dim must be divisible by this value. \| example `--additional_config={"oproj_tensor_parallel_size": 8}` ### How was this patch tested? - vLLM version: v0.10.1.1 - vLLM main: https://github.com/vllm-project/vllm/commit/eddaafc1c77b0690194cbd1b73747d572793838c --------- Signed-off-by: zzhx1 <zzh_201018@outlook.com> Co-authored-by: zzh <zzh_201018@outlook.com> 2025-09-07 10:31:32 +08:00
			`@classmethod`
			`def can_replace_layer(`
			`cls,`
			`source_layer: nn.Module,`
			`lora_config: LoRAConfig,`
			`packed_modules_list: list,`
			`model_config: Optional[PretrainedConfig],`
			`) -> bool:`
[Misc] Move lora patch file into lora module (#2797) Cleanup useless file in patch module. Update the lora support list is OK in vLLM Ascend, no need to patch vLLM - vLLM version: v0.10.1.1 - vLLM main: https://github.com/vllm-project/vllm/commit/f4962a6d55a340ebb569d377c842deff7611d8f7 Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com> 2025-09-08 21:42:12 +08:00			`return type(source_layer) is AscendRowParallelLinear`


			`class AscendVocabParallelEmbeddingWithLoRA(VocabParallelEmbeddingWithLoRA):`

			`@classmethod`
			`def can_replace_layer(`
			`cls,`
			`source_layer: nn.Module,`
			`lora_config: LoRAConfig,`
			`packed_modules_list: list,`
			`model_config: Optional[PretrainedConfig],`
			`) -> bool:`
			`return type(source_layer) is AscendVocabParallelEmbedding`


			`def refresh_all_lora_classes():`
			`vllm.lora.utils._all_lora_classes.add(AscendColumnParallelLinearWithLoRA)`
			`vllm.lora.utils._all_lora_classes.add(`
			`AscendMergedColumnParallelLinearWithLoRA)`
			`vllm.lora.utils._all_lora_classes.add(AscendRowParallelLinearWithLoRA)`
			`vllm.lora.utils._all_lora_classes.add(AscendVocabParallelEmbeddingWithLoRA)`