From cd1162e25ab9f30ba9a7eaf31bdbeab13bca2c89 Mon Sep 17 00:00:00 2001
From: wangxiyuan
Date: Tue, 6 Jan 2026 20:17:32 +0800
Subject: [PATCH] [Misc] Remove useless weight loader patch (#5619)

The patch for weight loader is useless now. Let's remove it

- vLLM version: v0.13.0
- vLLM main: https://github.com/vllm-project/vllm/commit/8be6432bdaf6275664d857b1e5e9bf8ed1ce299e

Signed-off-by: wangxiyuan
---
 vllm_ascend/patch/__init__.py                 | 22 +++-------
 vllm_ascend/patch/worker/__init__.py          |  1 -
 .../patch/worker/patch_weight_loader.py       | 41 -------------------
 3 files changed, 5 insertions(+), 59 deletions(-)
 delete mode 100644 vllm_ascend/patch/worker/patch_weight_loader.py

diff --git a/vllm_ascend/patch/__init__.py b/vllm_ascend/patch/__init__.py
index 84c36a34..a1037855 100644
--- a/vllm_ascend/patch/__init__.py
+++ b/vllm_ascend/patch/__init__.py
@@ -184,19 +184,7 @@
 # Future Plan:
 # Remove this patch when vLLM support the dispatch function.
 #
-# ** 7. File: worker/patch_weight_loader.py**
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-# 1. `vllm.model_executor.layers.linear.UnquantizedLinearMethod`
-# Why:
-# vLLM Ascend doesn't work with weight loader v2
-# How:
-# patch it to fix the bug.
-# Related PR (if no, explain why):
-# This is a bug by Ascend only. We should fix it soon
-# Future Plan:
-# Remove this patch when the bug is fixed.
-#
-# ** 8. File: worker/patch_qwen3_next_mtp.py**
+# ** 7. File: worker/patch_qwen3_next_mtp.py**
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 # 1. `vllm.v1.worker.utils.bind_kv_cache`
 # Why:
@@ -209,7 +197,7 @@
 # Future Plan:
 # Remove this patch after discussing with vllm community and adapting bind_kv_cache to npu.
 #
-# ** 9. File: worker/patch_module.py**
+# ** 8. File: worker/patch_module.py**
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 # 1. `vllm.v1.attention.backends.gdn_attn.torch.argsort`
 # Why:
@@ -225,7 +213,7 @@
 # Remove this patch when bool is supported in 'torch.argsort' func of npu.
 # Make 'torch.argsort' in `vllm.v1.attention.backends.gdn_attn` be stable.
 #
-# ** 10. File: worker/patch_rejection_sampler.py**
+# ** 9. File: worker/patch_rejection_sampler.py**
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 # 1. `vllm.v1.sample.rejection_sampler`
 # Why:
@@ -241,7 +229,7 @@
 # to override them, then delete the patch file `worker/patch_rejection_sampler.py`.
 # 2. make these functions as costom op, then remove AscendRejectionSampler
 #
-# ** 11.File: worker/patch_qwen3_next.py**
+# ** 10.File: worker/patch_qwen3_next.py**
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 # 1. `vllm.model_executor.models.qwen3_next.Qwen3NextGatedDeltaNet.forward`
 # Why:
@@ -253,7 +241,7 @@
 # Future Plan:
 # Remove this patch when vLLM support these operators.
 #
-# ** 12. File: worker/patch_qwen3_next.py**
+# ** 11. File: worker/patch_qwen3_next.py**
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 # 1. `vllm.model_executor.models.qwen3_next.Qwen3NextGatedDeltaNet._forward_core`
 # Why:
diff --git a/vllm_ascend/patch/worker/__init__.py b/vllm_ascend/patch/worker/__init__.py
index 371dd99c..b66e2f4b 100644
--- a/vllm_ascend/patch/worker/__init__.py
+++ b/vllm_ascend/patch/worker/__init__.py
@@ -25,7 +25,6 @@ import vllm_ascend.patch.platform.patch_sched_yield  # noqa
 import vllm_ascend.patch.worker.patch_bert  # noqa
 import vllm_ascend.patch.worker.patch_distributed  # noqa
 import vllm_ascend.patch.worker.patch_deepseek  # noqa
-import vllm_ascend.patch.worker.patch_weight_loader  # noqa
 import vllm_ascend.patch.worker.patch_multimodal_merge  # noqa
 import vllm_ascend.patch.worker.patch_minicpm  # noqa
 import vllm_ascend.patch.worker.patch_rope  # noqa
diff --git a/vllm_ascend/patch/worker/patch_weight_loader.py b/vllm_ascend/patch/worker/patch_weight_loader.py
deleted file mode 100644
index e0fcde04..00000000
--- a/vllm_ascend/patch/worker/patch_weight_loader.py
+++ /dev/null
@@ -1,41 +0,0 @@
-import torch
-from torch.nn.parameter import Parameter
-from vllm.logger import init_logger
-from vllm.model_executor.layers.linear import UnquantizedLinearMethod
-from vllm.model_executor.utils import set_weight_attrs
-from vllm.utils.mem_constants import GiB_bytes
-
-logger = init_logger(__name__)
-
-
-def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int,
-                   output_partition_sizes: list[int], input_size: int,
-                   output_size: int, params_dtype: torch.dtype,
-                   **extra_weight_attrs):
-    # This method creates unquantized linear weights.
-    # The weights are not quantized, and they are not sharded.
-    # The amount of memory allocated for the weights is
-    # sum(output_partition_sizes) * input_size_per_partition.
-    try:
-        weight = Parameter(torch.empty(sum(output_partition_sizes),
-                                       input_size_per_partition,
-                                       dtype=params_dtype),
-                           requires_grad=False)
-    except torch.cuda.OutOfMemoryError as e:
-        logger.error("Failed to create unquantized linear weights: %s", e)
-        if torch.cuda.is_available():
-            logger.debug("CUDA device: %s", torch.cuda.current_device())
-            logger.debug("Allocated: %.2f GiB",
-                         torch.cuda.memory_allocated() / GiB_bytes)
-            logger.debug("Reserved: %.2f GiB",
-                         torch.cuda.memory_reserved() / GiB_bytes)
-        raise RuntimeError(
-            "Failed to create unquantized linear weights. "
-            "This may be caused by insufficient memory to allocate "
-            "the weight.") from e
-    set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0})
-    layer.register_parameter("weight", weight)
-    set_weight_attrs(weight, extra_weight_attrs)
-
-
-UnquantizedLinearMethod.create_weights = create_weights
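The deleted module illustrates vllm-ascend's standard patching pattern: importing the module rebinds a method on an upstream vLLM class as a side effect, which is why dropping the single `import vllm_ascend.patch.worker.patch_weight_loader  # noqa` line in `worker/__init__.py` is enough to disable the patch. Below is a minimal, self-contained sketch of that pattern; `LinearMethod` and its simplified signature are stand-ins for illustration, not the real vLLM API:

    import torch
    from torch.nn.parameter import Parameter


    class LinearMethod:
        """Simplified stand-in for vLLM's UnquantizedLinearMethod."""

        def create_weights(self, layer: torch.nn.Module, input_size: int,
                           output_size: int, params_dtype: torch.dtype) -> None:
            raise NotImplementedError


    def create_weights(self, layer: torch.nn.Module, input_size: int,
                       output_size: int, params_dtype: torch.dtype) -> None:
        # Allocate the full weight up front and guard the allocation, as the
        # removed patch did, so an out-of-memory error surfaces clearly.
        try:
            weight = Parameter(torch.empty(output_size, input_size,
                                           dtype=params_dtype),
                               requires_grad=False)
        except RuntimeError as e:
            raise RuntimeError("Failed to create unquantized linear weights; "
                               "likely insufficient device memory.") from e
        layer.register_parameter("weight", weight)


    # The patch module's only side effect: rebind the method at import time.
    LinearMethod.create_weights = create_weights

    # After the rebind, every caller sees the patched implementation.
    layer = torch.nn.Module()
    LinearMethod().create_weights(layer, 4, 8, torch.float32)
    assert layer.weight.shape == (8, 4)

Because the rebind happens at import time, patch modules like this must be imported before any model construction; removing the import (as this PR does) cleanly restores the upstream behavior with no other code changes.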