[CI] Upgrade vllm to newest commit (#3182)
### What this PR does / why we need it?
Upgrade vLLM to the newest commit.
- Fix the problem that aclgraph doesn't work, caused by 24fab45d96
- Fix the PoolerOutput import error, caused by 755ed7b05b
- Fix the aclgraph weight load error, to keep it the same as the torchair fix. 4492e3a554

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
All tests should pass.

- vLLM version: v0.10.2
- vLLM main: 52d0cb8458

---------

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
2
.github/workflows/format_pr_body.yaml
vendored
2
.github/workflows/format_pr_body.yaml
vendored
@@ -36,7 +36,7 @@ jobs:
|
|||||||
|
|
||||||
- name: Get vLLM version
|
- name: Get vLLM version
|
||||||
run: |
|
run: |
|
||||||
VLLM_COMMIT=52d0cb845866869d587fc013a7c59e60a86ebcf2
|
VLLM_COMMIT=17b4c6685ce62d5652654784d6771a3d38e4273e
|
||||||
echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> $GITHUB_ENV
|
echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> $GITHUB_ENV
|
||||||
|
|
||||||
- name: Checkout repository
|
- name: Checkout repository
|
||||||
|
|||||||
6
.github/workflows/vllm_ascend_test.yaml
vendored
6
.github/workflows/vllm_ascend_test.yaml
vendored
@@ -42,7 +42,7 @@ jobs:
|
|||||||
lint:
|
lint:
|
||||||
uses: ./.github/workflows/pre-commit.yml
|
uses: ./.github/workflows/pre-commit.yml
|
||||||
with:
|
with:
|
||||||
vllm: 52d0cb845866869d587fc013a7c59e60a86ebcf2
|
vllm: 17b4c6685ce62d5652654784d6771a3d38e4273e
|
||||||
|
|
||||||
changes:
|
changes:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
@@ -83,7 +83,7 @@ jobs:
|
|||||||
VLLM_USE_MODELSCOPE: True
|
VLLM_USE_MODELSCOPE: True
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
vllm_version: [52d0cb845866869d587fc013a7c59e60a86ebcf2, v0.10.2]
|
vllm_version: [17b4c6685ce62d5652654784d6771a3d38e4273e, v0.10.2]
|
||||||
steps:
|
steps:
|
||||||
- name: Install packages
|
- name: Install packages
|
||||||
run: |
|
run: |
|
||||||
@@ -138,7 +138,7 @@ jobs:
|
|||||||
name: e2e-light
|
name: e2e-light
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
vllm_version: [52d0cb845866869d587fc013a7c59e60a86ebcf2, v0.10.2]
|
vllm_version: [17b4c6685ce62d5652654784d6771a3d38e4273e, v0.10.2]
|
||||||
# Note (yikun): If CI resource are limited we can split job into two chain jobs
|
# Note (yikun): If CI resource are limited we can split job into two chain jobs
|
||||||
needs: [lint, changes]
|
needs: [lint, changes]
|
||||||
# only trigger e2e test after lint passed and the change is e2e related with pull request.
|
# only trigger e2e test after lint passed and the change is e2e related with pull request.
|
||||||
|
|||||||
2
.github/workflows/vllm_ascend_test_full.yaml
vendored
2
.github/workflows/vllm_ascend_test_full.yaml
vendored
@@ -68,7 +68,7 @@ jobs:
|
|||||||
name: e2e-full
|
name: e2e-full
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
vllm_version: [52d0cb845866869d587fc013a7c59e60a86ebcf2, v0.10.2]
|
vllm_version: [17b4c6685ce62d5652654784d6771a3d38e4273e, v0.10.2]
|
||||||
needs: [changes]
|
needs: [changes]
|
||||||
if: ${{ needs.changes.outputs.e2e_tracker == 'true' }}
|
if: ${{ needs.changes.outputs.e2e_tracker == 'true' }}
|
||||||
uses: ./.github/workflows/_e2e_test.yaml
|
uses: ./.github/workflows/_e2e_test.yaml
|
||||||
|
|||||||
@@ -1,9 +1,6 @@
|
|||||||
import torch
|
import torch
|
||||||
from torch.nn.parameter import Parameter
|
from torch.nn.parameter import Parameter
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
# yapf: disable
|
|
||||||
from vllm.model_executor.parameter import ModelWeightParameter
|
|
||||||
# yapf: enable
|
|
||||||
from vllm.model_executor.utils import set_weight_attrs
|
from vllm.model_executor.utils import set_weight_attrs
|
||||||
from vllm.utils import GiB_bytes
|
from vllm.utils import GiB_bytes
|
||||||
|
|
||||||
@@ -16,27 +13,15 @@ def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int,
|
|||||||
output_partition_sizes: list[int], input_size: int,
|
output_partition_sizes: list[int], input_size: int,
|
||||||
output_size: int, params_dtype: torch.dtype,
|
output_size: int, params_dtype: torch.dtype,
|
||||||
**extra_weight_attrs):
|
**extra_weight_attrs):
|
||||||
from vllm_ascend.ascend_config import get_ascend_config
|
|
||||||
ascend_config = get_ascend_config()
|
|
||||||
# This method creates unquantized linear weights.
|
# This method creates unquantized linear weights.
|
||||||
# The weights are not quantized, and they are not sharded.
|
# The weights are not quantized, and they are not sharded.
|
||||||
# The amount of memory allocated for the weights is
|
# The amount of memory allocated for the weights is
|
||||||
# sum(output_partition_sizes) * input_size_per_partition.
|
# sum(output_partition_sizes) * input_size_per_partition.
|
||||||
try:
|
try:
|
||||||
if ascend_config.torchair_graph_config.enabled:
|
weight = Parameter(torch.empty(sum(output_partition_sizes),
|
||||||
weight = Parameter(torch.empty(sum(output_partition_sizes),
|
input_size_per_partition,
|
||||||
input_size_per_partition,
|
dtype=params_dtype),
|
||||||
dtype=params_dtype),
|
requires_grad=False)
|
||||||
requires_grad=False)
|
|
||||||
else:
|
|
||||||
weight_loader = extra_weight_attrs.pop("weight_loader")
|
|
||||||
weight = ModelWeightParameter(data=torch.empty(
|
|
||||||
sum(output_partition_sizes),
|
|
||||||
input_size_per_partition,
|
|
||||||
dtype=params_dtype),
|
|
||||||
input_dim=1,
|
|
||||||
output_dim=0,
|
|
||||||
weight_loader=weight_loader)
|
|
||||||
except torch.cuda.OutOfMemoryError as e:
|
except torch.cuda.OutOfMemoryError as e:
|
||||||
logger.error("Failed to create unquantized linear weights: %s", e)
|
logger.error("Failed to create unquantized linear weights: %s", e)
|
||||||
if torch.cuda.is_available():
|
if torch.cuda.is_available():
|
||||||
@@ -49,8 +34,7 @@ def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int,
|
|||||||
"Failed to create unquantized linear weights. "
|
"Failed to create unquantized linear weights. "
|
||||||
"This may be caused by insufficient memory to allocate "
|
"This may be caused by insufficient memory to allocate "
|
||||||
"the weight.") from e
|
"the weight.") from e
|
||||||
if ascend_config.torchair_graph_config.enabled:
|
set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0})
|
||||||
set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0})
|
|
||||||
layer.register_parameter("weight", weight)
|
layer.register_parameter("weight", weight)
|
||||||
set_weight_attrs(weight, extra_weight_attrs)
|
set_weight_attrs(weight, extra_weight_attrs)
|
||||||
|
|
||||||
|
|||||||
@@ -209,6 +209,11 @@ class NPUPlatform(Platform):
|
|||||||
# set cudagraph sizes before extending `compilation_config.splitting_ops`
|
# set cudagraph sizes before extending `compilation_config.splitting_ops`
|
||||||
vllm_config._set_cudagraph_sizes()
|
vllm_config._set_cudagraph_sizes()
|
||||||
|
|
||||||
|
# TODO: Full graph will be fully supported later, and the default value will then be set to full graph.
|
||||||
|
if not vllm_version_is("v0.10.2"):
|
||||||
|
if compilation_config.cudagraph_mode == CUDAGraphMode.FULL_AND_PIECEWISE:
|
||||||
|
compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
|
||||||
|
|
||||||
if compilation_config.cudagraph_mode == CUDAGraphMode.NONE:
|
if compilation_config.cudagraph_mode == CUDAGraphMode.NONE:
|
||||||
compilation_config.level = CompilationLevel.NO_COMPILATION
|
compilation_config.level = CompilationLevel.NO_COMPILATION
|
||||||
# TODO: Currently MLA does not support FULL_DECODE_ONLY, remove the second condition
|
# TODO: Currently MLA does not support FULL_DECODE_ONLY, remove the second condition
|
||||||
|
|||||||
@@ -33,7 +33,6 @@ from vllm.model_executor.layers.quantization.base_config import (
|
|||||||
from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
|
from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
|
||||||
from vllm.model_executor.layers.vocab_parallel_embedding import (
|
from vllm.model_executor.layers.vocab_parallel_embedding import (
|
||||||
UnquantizedEmbeddingMethod, VocabParallelEmbedding)
|
UnquantizedEmbeddingMethod, VocabParallelEmbedding)
|
||||||
from vllm.model_executor.parameter import PerTensorScaleParameter
|
|
||||||
from vllm.model_executor.utils import set_weight_attrs
|
from vllm.model_executor.utils import set_weight_attrs
|
||||||
|
|
||||||
from vllm_ascend.distributed.parallel_state import (get_mlp_tp_group,
|
from vllm_ascend.distributed.parallel_state import (get_mlp_tp_group,
|
||||||
@@ -251,7 +250,6 @@ class AscendLinearMethod(LinearMethodBase):
|
|||||||
**extra_weight_attrs,
|
**extra_weight_attrs,
|
||||||
) -> None:
|
) -> None:
|
||||||
output_size_per_partition = sum(output_partition_sizes)
|
output_size_per_partition = sum(output_partition_sizes)
|
||||||
weight_loader = extra_weight_attrs.get("weight_loader")
|
|
||||||
|
|
||||||
weight_dict = self.quant_method.get_weight(input_size_per_partition,
|
weight_dict = self.quant_method.get_weight(input_size_per_partition,
|
||||||
output_size_per_partition,
|
output_size_per_partition,
|
||||||
@@ -264,8 +262,7 @@ class AscendLinearMethod(LinearMethodBase):
|
|||||||
|
|
||||||
pertensor_dict = self.quant_method.get_pertensor_param(params_dtype)
|
pertensor_dict = self.quant_method.get_pertensor_param(params_dtype)
|
||||||
for pertensor_name, pertensor_param in pertensor_dict.items():
|
for pertensor_name, pertensor_param in pertensor_dict.items():
|
||||||
param = PerTensorScaleParameter(data=pertensor_param,
|
param = torch.nn.Parameter(pertensor_param, requires_grad=False)
|
||||||
weight_loader=weight_loader)
|
|
||||||
# disable warning
|
# disable warning
|
||||||
param.ignore_warning = True
|
param.ignore_warning = True
|
||||||
layer.register_parameter(pertensor_name, param)
|
layer.register_parameter(pertensor_name, param)
|
||||||
|
|||||||
@@ -28,20 +28,6 @@ from vllm_ascend.worker.worker_v1 import NPUWorker
|
|||||||
class NPUTorchairWorker(NPUWorker):
|
class NPUTorchairWorker(NPUWorker):
|
||||||
"""Torchair worker bases on NPUWorker. Only torchair specified code should be added in this class."""
|
"""Torchair worker bases on NPUWorker. Only torchair specified code should be added in this class."""
|
||||||
|
|
||||||
def __init__(self,
|
|
||||||
vllm_config,
|
|
||||||
local_rank,
|
|
||||||
rank,
|
|
||||||
distributed_init_method,
|
|
||||||
is_driver_worker=False,
|
|
||||||
**kwargs):
|
|
||||||
super().__init__(vllm_config, local_rank, rank,
|
|
||||||
distributed_init_method, is_driver_worker, **kwargs)
|
|
||||||
from vllm.model_executor.layers.linear import \
|
|
||||||
WEIGHT_LOADER_V2_SUPPORTED
|
|
||||||
if "UnquantizedLinearMethod" in WEIGHT_LOADER_V2_SUPPORTED:
|
|
||||||
WEIGHT_LOADER_V2_SUPPORTED.remove("UnquantizedLinearMethod")
|
|
||||||
|
|
||||||
def determine_available_memory(self) -> int:
|
def determine_available_memory(self) -> int:
|
||||||
"""Override determine_available_memory to use cached torchair kv_cache_bytes."""
|
"""Override determine_available_memory to use cached torchair kv_cache_bytes."""
|
||||||
|
|
||||||
|
|||||||
@@ -64,11 +64,12 @@ from vllm.multimodal.inputs import MultiModalKwargsItem, PlaceholderRange
|
|||||||
from vllm.multimodal.utils import group_mm_kwargs_by_modality
|
from vllm.multimodal.utils import group_mm_kwargs_by_modality
|
||||||
from vllm.pooling_params import PoolingParams
|
from vllm.pooling_params import PoolingParams
|
||||||
from vllm.sampling_params import SamplingType
|
from vllm.sampling_params import SamplingType
|
||||||
from vllm.sequence import IntermediateTensors, PoolerOutput
|
from vllm.sequence import IntermediateTensors
|
||||||
from vllm.tasks import GenerationTask, PoolingTask, SupportedTask
|
from vllm.tasks import GenerationTask, PoolingTask, SupportedTask
|
||||||
from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler,
|
from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler,
|
||||||
LazyLoader, cdiv, get_dtype_size,
|
LazyLoader, cdiv, get_dtype_size,
|
||||||
is_pin_memory_available)
|
is_pin_memory_available)
|
||||||
|
from vllm.utils.jsontree import json_map_leaves
|
||||||
from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadataBuilder
|
from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadataBuilder
|
||||||
from vllm.v1.attention.backends.utils import (
|
from vllm.v1.attention.backends.utils import (
|
||||||
AttentionCGSupport, reorder_batch_to_split_decodes_and_prefills)
|
AttentionCGSupport, reorder_batch_to_split_decodes_and_prefills)
|
||||||
@@ -144,7 +145,9 @@ else:
|
|||||||
|
|
||||||
if not vllm_version_is("0.10.2"):
|
if not vllm_version_is("0.10.2"):
|
||||||
from vllm.v1.kv_cache_interface import UniformTypeKVCacheSpecs
|
from vllm.v1.kv_cache_interface import UniformTypeKVCacheSpecs
|
||||||
|
from vllm.v1.outputs import PoolerOutput
|
||||||
else:
|
else:
|
||||||
|
from vllm.sequence import PoolerOutput
|
||||||
UniformTypeKVCacheSpecs = None
|
UniformTypeKVCacheSpecs = None
|
||||||
|
|
||||||
|
|
||||||
@@ -1806,18 +1809,30 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
|||||||
device=hidden_states.device)
|
device=hidden_states.device)
|
||||||
seq_lens_cpu = self.seq_lens_cpu[:self.input_batch.num_reqs]
|
seq_lens_cpu = self.seq_lens_cpu[:self.input_batch.num_reqs]
|
||||||
|
|
||||||
# Pooling models D2H & synchronize occurs in pooler.py:build_output
|
if vllm_version_is("0.10.2"):
|
||||||
raw_pooler_output = self.model.pooler(
|
# Pooling models D2H & synchronize occurs in pooler.py:build_output
|
||||||
hidden_states=hidden_states, pooling_metadata=pooling_metadata)
|
raw_pooler_output = self.model.pooler(
|
||||||
|
hidden_states=hidden_states, pooling_metadata=pooling_metadata)
|
||||||
|
else:
|
||||||
|
model = cast(VllmModelForPooling, self.model)
|
||||||
|
raw_pooler_output = model.pooler(
|
||||||
|
hidden_states=hidden_states,
|
||||||
|
pooling_metadata=pooling_metadata,
|
||||||
|
)
|
||||||
|
raw_pooler_output = json_map_leaves(
|
||||||
|
lambda x: x.to("cpu", non_blocking=True),
|
||||||
|
raw_pooler_output,
|
||||||
|
)
|
||||||
|
torch.npu.synchronize()
|
||||||
|
|
||||||
pooler_output: list[Optional[torch.Tensor]] = []
|
pooler_output: list[Optional[torch.Tensor]] = []
|
||||||
for raw_output, seq_len, prompt_len in zip(
|
for raw_output, seq_len, prompt_len in zip(
|
||||||
raw_pooler_output, seq_lens_cpu, pooling_metadata.prompt_lens):
|
raw_pooler_output, seq_lens_cpu, pooling_metadata.prompt_lens):
|
||||||
|
if vllm_version_is("0.10.2"):
|
||||||
if seq_len == prompt_len:
|
output = raw_output.data if seq_len == prompt_len else None
|
||||||
pooler_output.append(raw_output.data)
|
|
||||||
else:
|
else:
|
||||||
pooler_output.append(None)
|
output = raw_output if seq_len == prompt_len else None
|
||||||
|
pooler_output.append(output)
|
||||||
|
|
||||||
return ModelRunnerOutput(
|
return ModelRunnerOutput(
|
||||||
req_ids=self.input_batch.req_ids,
|
req_ids=self.input_batch.req_ids,
|
||||||
@@ -2582,7 +2597,10 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
|||||||
for task in self.get_supported_pooling_tasks():
|
for task in self.get_supported_pooling_tasks():
|
||||||
# Run a full batch with each task to ensure none of them OOMs
|
# Run a full batch with each task to ensure none of them OOMs
|
||||||
output = self._dummy_pooler_run_task(hidden_states, task)
|
output = self._dummy_pooler_run_task(hidden_states, task)
|
||||||
output_size[task] = output.get_data_nbytes()
|
if vllm_version_is("0.10.2"):
|
||||||
|
output_size[task] = output.get_data_nbytes()
|
||||||
|
else:
|
||||||
|
output_size[task] = sum(o.nbytes for o in output)
|
||||||
del output # Allow GC
|
del output # Allow GC
|
||||||
|
|
||||||
max_task = max(output_size.items(), key=lambda x: x[1])[0]
|
max_task = max(output_size.items(), key=lambda x: x[1])[0]
|
||||||
|
|||||||
@@ -116,6 +116,12 @@ class NPUWorker(WorkerBase):
|
|||||||
# Buffers saved before sleep
|
# Buffers saved before sleep
|
||||||
self._sleep_saved_buffers: dict[str, torch.Tensor] = {}
|
self._sleep_saved_buffers: dict[str, torch.Tensor] = {}
|
||||||
|
|
||||||
|
# FixMe: this is a patch to fix the issue caused by https://github.com/vllm-project/vllm/commit/de94289a98d7ec52a5ef02719e01a1db8b505170
|
||||||
|
from vllm.model_executor.layers.linear import \
|
||||||
|
WEIGHT_LOADER_V2_SUPPORTED
|
||||||
|
if "UnquantizedLinearMethod" in WEIGHT_LOADER_V2_SUPPORTED:
|
||||||
|
WEIGHT_LOADER_V2_SUPPORTED.remove("UnquantizedLinearMethod")
|
||||||
|
|
||||||
def sleep(self, level: int = 1) -> None:
|
def sleep(self, level: int = 1) -> None:
|
||||||
if not sleep_mode_enabled():
|
if not sleep_mode_enabled():
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
|
|||||||
Reference in New Issue
Block a user