From 9cee025f41a7a362afcc22c1e1c4549793007d5a Mon Sep 17 00:00:00 2001 From: Li Wei <52344829+liwei109@users.noreply.github.com> Date: Mon, 29 Dec 2025 19:56:24 +0800 Subject: [PATCH] Merge pull request #59 from liwei109/aicapx-quant [fix]remove weight_loader_v2 to support cuda graph --- vllm_kunlun/ops/__init__.py | 1 + vllm_kunlun/ops/linear.py | 43 +++++++++++++++++++++++-------------- 2 files changed, 28 insertions(+), 16 deletions(-) diff --git a/vllm_kunlun/ops/__init__.py b/vllm_kunlun/ops/__init__.py index f11be00..042d1f0 100644 --- a/vllm_kunlun/ops/__init__.py +++ b/vllm_kunlun/ops/__init__.py @@ -15,6 +15,7 @@ # This file is a part of the vllm-ascend project. # +# import vllm_kunlun.ops.linear import vllm_kunlun.ops.rotary_embedding import vllm_kunlun.ops.layernorm import vllm_kunlun.ops.quantization.awq diff --git a/vllm_kunlun/ops/linear.py b/vllm_kunlun/ops/linear.py index 738d326..db854b8 100644 --- a/vllm_kunlun/ops/linear.py +++ b/vllm_kunlun/ops/linear.py @@ -4,27 +4,36 @@ import torch import torch.nn as nn from torch.nn.parameter import Parameter -from vllm.model_executor.layers.linear import ReplicatedLinear as VllmReplicatedLinear -from vllm.model_executor.layers.linear import UnquantizedLinearMethod +from vllm.model_executor.layers.linear import ( + WEIGHT_LOADER_V2_SUPPORTED, + ReplicatedLinear, + UnquantizedLinearMethod, +) from vllm.model_executor.utils import set_weight_attrs +from vllm.model_executor.parameter import ModelWeightParameter +from vllm.logger import init_logger + +logger = init_logger(__name__) -class ReplicatedLinear(VllmReplicatedLinear): - """Replicated linear layer""" - - def get_weights(self): - """get_weights""" - if hasattr(self, "kunlun_linear_weights"): - return self.kunlun_linear_weights - weights = torch.nn.Parameter(self.weight.to(torch.float32)) - self.register_parameter("kunlun_linear_weights", weights) +def get_weights(self): + """get_weights""" + if hasattr(self, "kunlun_linear_weights"): return 
self.kunlun_linear_weights + weights = torch.nn.Parameter(self.weight.to(torch.float32)) + self.register_parameter("kunlun_linear_weights", weights) + return self.kunlun_linear_weights - def get_weights_half(self): - """get_weights_half""" - if hasattr(self, "kunlun_linear_weights_half"): - return self.kunlun_linear_weights_half - weights = torch.nn.Parameter(self.weight.to(torch.float16)) + + def get_weights_half(self): + """get_weights_half""" + if hasattr(self, "kunlun_linear_weights_half"): + return self.kunlun_linear_weights_half + weights = torch.nn.Parameter(self.weight.to(torch.float16)) + + + ReplicatedLinear.get_weights = get_weights + ReplicatedLinear.get_weights_half = get_weights_half def create_weights( @@ -48,4 +57,6 @@ def create_weights( set_weight_attrs(weight, extra_weight_attrs) + # rewrite create_weights and remove weight_loader_v2 to support cuda graph UnquantizedLinearMethod.create_weights = create_weights + WEIGHT_LOADER_V2_SUPPORTED.remove("UnquantizedLinearMethod") \ No newline at end of file