### What this PR does / why we need it?

Upgrade to new vllm commit: c9461e05a4

- Fix many imports, caused by https://github.com/vllm-project/vllm/pull/26908
- Fix import of `sha256`, caused by https://github.com/vllm-project/vllm/pull/27169
- Remove `SchedulerConfig.send_delta_data`, caused by https://github.com/vllm-project/vllm/pull/27142
- Fix `FusedMoE` because of dual stream execution, caused by https://github.com/vllm-project/vllm/pull/26440

### Does this PR introduce _any_ user-facing change?

N/A

### How was this patch tested?

CI passed with newly added/existing tests.

- vLLM version: v0.11.0rc3
- vLLM main: 17c540a993

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
Signed-off-by: Icey <1790571317@qq.com>
Co-authored-by: MengqingCao <cmq0113@163.com>
48 lines
2.0 KiB
Python
48 lines
2.0 KiB
Python
import torch
|
|
from torch.nn.parameter import Parameter
|
|
from vllm.logger import init_logger
|
|
from vllm.model_executor.layers.linear import UnquantizedLinearMethod
|
|
from vllm.model_executor.utils import set_weight_attrs
|
|
|
|
from vllm_ascend.utils import vllm_version_is
|
|
|
|
if vllm_version_is("0.11.0"):
|
|
from vllm.utils import GiB_bytes
|
|
else:
|
|
from vllm.utils.mem_constants import GiB_bytes
|
|
|
|
# Logger for this module, created from the module name; used by
# create_weights to report allocation failures.
logger = init_logger(__name__)
|
|
|
|
|
|
def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int,
                   output_partition_sizes: list[int], input_size: int,
                   output_size: int, params_dtype: torch.dtype,
                   **extra_weight_attrs):
    """Allocate the unquantized weight of a linear layer and attach it.

    An uninitialized 2-D tensor with ``sum(output_partition_sizes)`` rows and
    ``input_size_per_partition`` columns is wrapped in a non-trainable
    ``Parameter`` and registered on ``layer`` under the name ``"weight"``.

    Args:
        layer: Module that receives the ``weight`` parameter.
        input_size_per_partition: Number of columns of the weight.
        output_partition_sizes: Sizes whose sum gives the number of rows.
        input_size: Not used by this implementation.
        output_size: Not used by this implementation.
        params_dtype: dtype of the allocated tensor.
        **extra_weight_attrs: Extra attributes handed to
            ``set_weight_attrs`` after the parameter has been registered.

    Raises:
        RuntimeError: If the allocation raises
            ``torch.cuda.OutOfMemoryError``; the original error is chained
            as the cause.
    """
    try:
        # Total element count is sum(output_partition_sizes) *
        # input_size_per_partition.
        out_rows = sum(output_partition_sizes)
        storage = torch.empty(out_rows,
                              input_size_per_partition,
                              dtype=params_dtype)
        param = Parameter(storage, requires_grad=False)
    except torch.cuda.OutOfMemoryError as oom:
        logger.error("Failed to create unquantized linear weights: %s", oom)
        if torch.cuda.is_available():
            # Device and memory statistics are emitted at debug level only.
            device_index = torch.cuda.current_device()
            logger.debug("CUDA device: %s", device_index)
            allocated_gib = torch.cuda.memory_allocated() / GiB_bytes
            logger.debug("Allocated: %.2f GiB", allocated_gib)
            reserved_gib = torch.cuda.memory_reserved() / GiB_bytes
            logger.debug("Reserved: %.2f GiB", reserved_gib)
        failure_message = (
            "Failed to create unquantized linear weights. "
            "This may be caused by insufficient memory to allocate "
            "the weight.")
        raise RuntimeError(failure_message) from oom

    # Axis 0 is sized by the output partitions, axis 1 by the input size.
    layout_attrs = {"input_dim": 1, "output_dim": 0}
    set_weight_attrs(param, layout_attrs)
    layer.register_parameter("weight", param)
    # Caller-supplied attributes are applied last, after registration.
    set_weight_attrs(param, extra_weight_attrs)
|
|
|
|
|
|
# Monkey patch applied at import time: replace the create_weights method of
# vLLM's UnquantizedLinearMethod with the create_weights function defined in
# this module.
UnquantizedLinearMethod.create_weights = create_weights
|