[Misc] Remove useless weight loader patch (#5619)
The weight loader patch is no longer needed. Let's remove it.
- vLLM version: v0.13.0
- vLLM main: 8be6432bda
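For context: the removed module patched vLLM purely by rebinding a method at import time, so deleting the file and its import line is all that is needed to fall back to vLLM's own implementation. A minimal sketch of the pattern (simplified from the deleted file below, not the verbatim code):

    # sketch: how a vllm_ascend worker patch module takes effect
    from vllm.model_executor.layers.linear import UnquantizedLinearMethod

    def create_weights(self, *args, **kwargs):
        ...  # replacement behavior lived here

    # The rebinding runs as a side effect of importing the module,
    # which is why a bare `import ...  # noqa` enables the patch and
    # removing that import disables it.
    UnquantizedLinearMethod.create_weights = create_weights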
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
@@ -184,19 +184,7 @@
 # Future Plan:
 # Remove this patch when vLLM support the dispatch function.
 #
-# ** 7. File: worker/patch_weight_loader.py**
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-# 1. `vllm.model_executor.layers.linear.UnquantizedLinearMethod`
-# Why:
-# vLLM Ascend doesn't work with weight loader v2
-# How:
-# patch it to fix the bug.
-# Related PR (if no, explain why):
-# This is a bug by Ascend only. We should fix it soon
-# Future Plan:
-# Remove this patch when the bug is fixed.
-#
-# ** 8. File: worker/patch_qwen3_next_mtp.py**
+# ** 7. File: worker/patch_qwen3_next_mtp.py**
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 # 1. `vllm.v1.worker.utils.bind_kv_cache`
 # Why:
@@ -209,7 +197,7 @@
 # Future Plan:
 # Remove this patch after discussing with vllm community and adapting bind_kv_cache to npu.
 #
-# ** 9. File: worker/patch_module.py**
+# ** 8. File: worker/patch_module.py**
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 # 1. `vllm.v1.attention.backends.gdn_attn.torch.argsort`
 # Why:
@@ -225,7 +213,7 @@
 # Remove this patch when bool is supported in 'torch.argsort' func of npu.
 # Make 'torch.argsort' in `vllm.v1.attention.backends.gdn_attn` be stable.
 #
-# ** 10. File: worker/patch_rejection_sampler.py**
+# ** 9. File: worker/patch_rejection_sampler.py**
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 # 1. `vllm.v1.sample.rejection_sampler`
 # Why:
@@ -241,7 +229,7 @@
 # to override them, then delete the patch file `worker/patch_rejection_sampler.py`.
 # 2. make these functions as costom op, then remove AscendRejectionSampler
 #
-# ** 11.File: worker/patch_qwen3_next.py**
+# ** 10.File: worker/patch_qwen3_next.py**
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 # 1. `vllm.model_executor.models.qwen3_next.Qwen3NextGatedDeltaNet.forward`
 # Why:
@@ -253,7 +241,7 @@
 # Future Plan:
 # Remove this patch when vLLM support these operators.
 #
-# ** 12. File: worker/patch_qwen3_next.py**
+# ** 11. File: worker/patch_qwen3_next.py**
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 # 1. `vllm.model_executor.models.qwen3_next.Qwen3NextGatedDeltaNet._forward_core`
 # Why:
@@ -25,7 +25,6 @@ import vllm_ascend.patch.platform.patch_sched_yield  # noqa
 import vllm_ascend.patch.worker.patch_bert  # noqa
 import vllm_ascend.patch.worker.patch_distributed  # noqa
 import vllm_ascend.patch.worker.patch_deepseek  # noqa
-import vllm_ascend.patch.worker.patch_weight_loader  # noqa
 import vllm_ascend.patch.worker.patch_multimodal_merge  # noqa
 import vllm_ascend.patch.worker.patch_minicpm  # noqa
 import vllm_ascend.patch.worker.patch_rope  # noqa
@@ -1,41 +0,0 @@
-import torch
-from torch.nn.parameter import Parameter
-from vllm.logger import init_logger
-from vllm.model_executor.layers.linear import UnquantizedLinearMethod
-from vllm.model_executor.utils import set_weight_attrs
-from vllm.utils.mem_constants import GiB_bytes
-
-logger = init_logger(__name__)
-
-
-def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int,
-                   output_partition_sizes: list[int], input_size: int,
-                   output_size: int, params_dtype: torch.dtype,
-                   **extra_weight_attrs):
-    # This method creates unquantized linear weights.
-    # The weights are not quantized, and they are not sharded.
-    # The amount of memory allocated for the weights is
-    # sum(output_partition_sizes) * input_size_per_partition.
-    try:
-        weight = Parameter(torch.empty(sum(output_partition_sizes),
-                                       input_size_per_partition,
-                                       dtype=params_dtype),
-                           requires_grad=False)
-    except torch.cuda.OutOfMemoryError as e:
-        logger.error("Failed to create unquantized linear weights: %s", e)
-        if torch.cuda.is_available():
-            logger.debug("CUDA device: %s", torch.cuda.current_device())
-            logger.debug("Allocated: %.2f GiB",
-                         torch.cuda.memory_allocated() / GiB_bytes)
-            logger.debug("Reserved: %.2f GiB",
-                         torch.cuda.memory_reserved() / GiB_bytes)
-        raise RuntimeError(
-            "Failed to create unquantized linear weights. "
-            "This may be caused by insufficient memory to allocate "
-            "the weight.") from e
-    set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0})
-    layer.register_parameter("weight", weight)
-    set_weight_attrs(weight, extra_weight_attrs)
-
-
-UnquantizedLinearMethod.create_weights = create_weights
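If needed, one can sanity-check that the override is gone (a hypothetical snippet, assuming the vllm_ascend patch modules have already been imported):

    from vllm.model_executor.layers.linear import UnquantizedLinearMethod

    # Before this change this printed the patch module's name
    # (vllm_ascend.patch.worker.patch_weight_loader); after it,
    # it should report vLLM's own linear module.
    print(UnquantizedLinearMethod.create_weights.__module__)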