[Misc] Remove useless weight loader patch (#5619)

The weight loader patch is no longer needed, so remove it. vLLM version: v0.13.0; vLLM main: 8be6432bda. Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2026-01-06 20:17:32 +08:00
parent 089ca2ddcc
commit cd1162e25a
3 changed files with 5 additions and 59 deletions
--- a/vllm_ascend/patch/init.py
+++ b/vllm_ascend/patch/init.py
@@ -184,19 +184,7 @@
 #    Future Plan:
 #       Remove this patch when vLLM support the dispatch function.
 #
-# ** 7. File: worker/patch_weight_loader.py**
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-#   1. `vllm.model_executor.layers.linear.UnquantizedLinearMethod`
-#    Why:
-#       vLLM Ascend doesn't work with weight loader v2
-#    How：
-#       patch it to fix the bug.
-#    Related PR (if no, explain why):
-#       This is a bug by Ascend only.  We should fix it soon
-#    Future Plan:
-#       Remove this patch when the bug is fixed.
-#
-# ** 8. File: worker/patch_qwen3_next_mtp.py**
+# ** 7. File: worker/patch_qwen3_next_mtp.py**
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 #   1. `vllm.v1.worker.utils.bind_kv_cache`
 #    Why:
@@ -209,7 +197,7 @@
 #    Future Plan:
 #       Remove this patch after discussing with vllm community and adapting bind_kv_cache to npu.
 #
-# ** 9. File: worker/patch_module.py**
+# ** 8. File: worker/patch_module.py**
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 #   1. `vllm.v1.attention.backends.gdn_attn.torch.argsort`
 #    Why:
@@ -225,7 +213,7 @@
 #       Remove this patch when bool is supported in 'torch.argsort' func of npu.
 #       Make 'torch.argsort' in `vllm.v1.attention.backends.gdn_attn` be stable.
 #
-# ** 10. File: worker/patch_rejection_sampler.py**
+# ** 9. File: worker/patch_rejection_sampler.py**
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 #   1. `vllm.v1.sample.rejection_sampler`
 #    Why:
@@ -241,7 +229,7 @@
 #           to override them, then delete the patch file `worker/patch_rejection_sampler.py`.
 #       2. make these functions as costom op, then remove AscendRejectionSampler
 #
-# ** 11.File: worker/patch_qwen3_next.py**
+# ** 10.File: worker/patch_qwen3_next.py**
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 #   1. `vllm.model_executor.models.qwen3_next.Qwen3NextGatedDeltaNet.forward`
 #    Why:
@@ -253,7 +241,7 @@
 #    Future Plan:
 #       Remove this patch when vLLM support these operators.
 #
-# ** 12. File: worker/patch_qwen3_next.py**
+# ** 11. File: worker/patch_qwen3_next.py**
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 #   1. `vllm.model_executor.models.qwen3_next.Qwen3NextGatedDeltaNet._forward_core`
 #    Why:
--- a/vllm_ascend/patch/worker/init.py
+++ b/vllm_ascend/patch/worker/init.py
@@ -25,7 +25,6 @@ import vllm_ascend.patch.platform.patch_sched_yield  # noqa
 import vllm_ascend.patch.worker.patch_bert  # noqa
 import vllm_ascend.patch.worker.patch_distributed  # noqa
 import vllm_ascend.patch.worker.patch_deepseek  # noqa
-import vllm_ascend.patch.worker.patch_weight_loader  # noqa
 import vllm_ascend.patch.worker.patch_multimodal_merge  # noqa
 import vllm_ascend.patch.worker.patch_minicpm  # noqa
 import vllm_ascend.patch.worker.patch_rope  # noqa
--- a/vllm_ascend/patch/worker/patch_weight_loader.py
+++ b/vllm_ascend/patch/worker/patch_weight_loader.py
@@ -1,41 +0,0 @@
-import torch
-from torch.nn.parameter import Parameter
-from vllm.logger import init_logger
-from vllm.model_executor.layers.linear import UnquantizedLinearMethod
-from vllm.model_executor.utils import set_weight_attrs
-from vllm.utils.mem_constants import GiB_bytes
-
-logger = init_logger(__name__)
-
-
-def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int,
-                   output_partition_sizes: list[int], input_size: int,
-                   output_size: int, params_dtype: torch.dtype,
-                   **extra_weight_attrs):
-    # This method creates unquantized linear weights.
-    # The weights are not quantized, and they are not sharded.
-    # The amount of memory allocated for the weights is
-    # sum(output_partition_sizes) * input_size_per_partition.
-    try:
-        weight = Parameter(torch.empty(sum(output_partition_sizes),
-                                       input_size_per_partition,
-                                       dtype=params_dtype),
-                           requires_grad=False)
-    except torch.cuda.OutOfMemoryError as e:
-        logger.error("Failed to create unquantized linear weights: %s", e)
-        if torch.cuda.is_available():
-            logger.debug("CUDA device: %s", torch.cuda.current_device())
-            logger.debug("Allocated: %.2f GiB",
-                         torch.cuda.memory_allocated() / GiB_bytes)
-            logger.debug("Reserved: %.2f GiB",
-                         torch.cuda.memory_reserved() / GiB_bytes)
-        raise RuntimeError(
-            "Failed to create unquantized linear weights. "
-            "This may be caused by insufficient memory to allocate "
-            "the weight.") from e
-    set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0})
-    layer.register_parameter("weight", weight)
-    set_weight_attrs(weight, extra_weight_attrs)
-
-
-UnquantizedLinearMethod.create_weights = create_weights