[CI] Upgrade vllm to newest commit (#3182)

### What this PR does / why we need it?
Upgrade vLLM to the newest commit.

- Fix the problem where aclgraph does not work, caused by
24fab45d96
- Fix the PoolerOutput import error (compatibility pattern sketched below), caused by
755ed7b05b
- Fix the aclgraph weight-load error, keeping it consistent with the torchair fix:
4492e3a554
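
All three fixes rely on the same compatibility pattern: behavior is gated on the installed vLLM version so that both v0.10.2 and the pinned main commit keep working. A minimal sketch of that pattern (assuming the `vllm_version_is` helper lives in `vllm_ascend.utils`, as its use in the model runner hunk below suggests):

```python
# Illustrative sketch only, not the full model runner code.
from vllm_ascend.utils import vllm_version_is  # assumed helper location

if vllm_version_is("0.10.2"):
    # v0.10.2 still exposes PoolerOutput here, and pooler.py performs the
    # device-to-host copy itself.
    from vllm.sequence import PoolerOutput  # noqa: F401
else:
    # Newer vLLM moved PoolerOutput, so the runner copies pooled tensors to
    # the CPU explicitly (see the json_map_leaves / synchronize hunk below).
    from vllm.v1.outputs import PoolerOutput  # noqa: F401
```

The weight-load fix follows the same idea on both versions: `create_weights` now always builds a plain `torch.nn.Parameter`, and `NPUWorker.__init__` drops `"UnquantizedLinearMethod"` from vLLM's `WEIGHT_LOADER_V2_SUPPORTED` list so the v1 weight loader is used for unquantized linear layers.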

### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
All tests should pass.


- vLLM version: v0.10.2
- vLLM main:
52d0cb8458

---------

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Author: wangxiyuan <wangxiyuan1007@gmail.com>
Date: 2025-09-26 06:18:15 +08:00
Committed by: GitHub
Parent: 0794f64a18
Commit: 2930e4a6bd

9 changed files with 49 additions and 53 deletions

View File

@@ -36,7 +36,7 @@ jobs:
       - name: Get vLLM version
         run: |
-          VLLM_COMMIT=52d0cb845866869d587fc013a7c59e60a86ebcf2
+          VLLM_COMMIT=17b4c6685ce62d5652654784d6771a3d38e4273e
           echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> $GITHUB_ENV
       - name: Checkout repository

View File

@@ -42,7 +42,7 @@ jobs:
   lint:
     uses: ./.github/workflows/pre-commit.yml
     with:
-      vllm: 52d0cb845866869d587fc013a7c59e60a86ebcf2
+      vllm: 17b4c6685ce62d5652654784d6771a3d38e4273e
 
   changes:
     runs-on: ubuntu-latest
@@ -83,7 +83,7 @@ jobs:
       VLLM_USE_MODELSCOPE: True
     strategy:
       matrix:
-        vllm_version: [52d0cb845866869d587fc013a7c59e60a86ebcf2, v0.10.2]
+        vllm_version: [17b4c6685ce62d5652654784d6771a3d38e4273e, v0.10.2]
     steps:
       - name: Install packages
         run: |
@@ -138,7 +138,7 @@ jobs:
     name: e2e-light
     strategy:
       matrix:
-        vllm_version: [52d0cb845866869d587fc013a7c59e60a86ebcf2, v0.10.2]
+        vllm_version: [17b4c6685ce62d5652654784d6771a3d38e4273e, v0.10.2]
     # Note (yikun): If CI resource are limited we can split job into two chain jobs
     needs: [lint, changes]
     # only trigger e2e test after lint passed and the change is e2e related with pull request.

View File

@@ -68,7 +68,7 @@ jobs:
     name: e2e-full
     strategy:
       matrix:
-        vllm_version: [52d0cb845866869d587fc013a7c59e60a86ebcf2, v0.10.2]
+        vllm_version: [17b4c6685ce62d5652654784d6771a3d38e4273e, v0.10.2]
     needs: [changes]
     if: ${{ needs.changes.outputs.e2e_tracker == 'true' }}
     uses: ./.github/workflows/_e2e_test.yaml

View File

@@ -1,9 +1,6 @@
 import torch
 from torch.nn.parameter import Parameter
 from vllm.logger import init_logger
-# yapf: disable
-from vllm.model_executor.parameter import ModelWeightParameter
-# yapf: enable
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.utils import GiB_bytes
@@ -16,27 +13,15 @@ def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int,
                    output_partition_sizes: list[int], input_size: int,
                    output_size: int, params_dtype: torch.dtype,
                    **extra_weight_attrs):
-    from vllm_ascend.ascend_config import get_ascend_config
-    ascend_config = get_ascend_config()
     # This method creates unquantized linear weights.
     # The weights are not quantized, and they are not sharded.
     # The amount of memory allocated for the weights is
     # sum(output_partition_sizes) * input_size_per_partition.
     try:
-        if ascend_config.torchair_graph_config.enabled:
-            weight = Parameter(torch.empty(sum(output_partition_sizes),
-                                           input_size_per_partition,
-                                           dtype=params_dtype),
-                               requires_grad=False)
-        else:
-            weight_loader = extra_weight_attrs.pop("weight_loader")
-            weight = ModelWeightParameter(data=torch.empty(
-                sum(output_partition_sizes),
-                input_size_per_partition,
-                dtype=params_dtype),
-                                          input_dim=1,
-                                          output_dim=0,
-                                          weight_loader=weight_loader)
+        weight = Parameter(torch.empty(sum(output_partition_sizes),
+                                       input_size_per_partition,
+                                       dtype=params_dtype),
+                           requires_grad=False)
     except torch.cuda.OutOfMemoryError as e:
         logger.error("Failed to create unquantized linear weights: %s", e)
         if torch.cuda.is_available():
@@ -49,8 +34,7 @@ def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int,
                 "Failed to create unquantized linear weights. "
                 "This may be caused by insufficient memory to allocate "
                 "the weight.") from e
-    if ascend_config.torchair_graph_config.enabled:
-        set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0})
+    set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0})
     layer.register_parameter("weight", weight)
     set_weight_attrs(weight, extra_weight_attrs)

View File

@@ -209,6 +209,11 @@ class NPUPlatform(Platform):
         # set cudaprah sizes before extending `compilation_config.splitting_ops`
         vllm_config._set_cudagraph_sizes()
 
+        # TODO: Full graph is fully supported later, and the default value will be set to full graph.
+        if not vllm_version_is("v0.10.2"):
+            if compilation_config.cudagraph_mode == CUDAGraphMode.FULL_AND_PIECEWISE:
+                compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
+
         if compilation_config.cudagraph_mode == CUDAGraphMode.NONE:
             compilation_config.level = CompilationLevel.NO_COMPILATION
         # TODO: Currently MLA does not support FULL_DECODE_ONLY, remove the second condition

View File

@@ -33,7 +33,6 @@ from vllm.model_executor.layers.quantization.base_config import (
 from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     UnquantizedEmbeddingMethod, VocabParallelEmbedding)
-from vllm.model_executor.parameter import PerTensorScaleParameter
 from vllm.model_executor.utils import set_weight_attrs
 from vllm_ascend.distributed.parallel_state import (get_mlp_tp_group,
@@ -251,7 +250,6 @@ class AscendLinearMethod(LinearMethodBase):
         **extra_weight_attrs,
     ) -> None:
         output_size_per_partition = sum(output_partition_sizes)
-        weight_loader = extra_weight_attrs.get("weight_loader")
 
         weight_dict = self.quant_method.get_weight(input_size_per_partition,
                                                    output_size_per_partition,
@@ -264,8 +262,7 @@ class AscendLinearMethod(LinearMethodBase):
         pertensor_dict = self.quant_method.get_pertensor_param(params_dtype)
         for pertensor_name, pertensor_param in pertensor_dict.items():
-            param = PerTensorScaleParameter(data=pertensor_param,
-                                            weight_loader=weight_loader)
+            param = torch.nn.Parameter(pertensor_param, requires_grad=False)
             # disable warning
             param.ignore_warning = True
             layer.register_parameter(pertensor_name, param)

View File

@@ -28,20 +28,6 @@ from vllm_ascend.worker.worker_v1 import NPUWorker
 class NPUTorchairWorker(NPUWorker):
     """Torchair worker bases on NPUWorker. Only torchair specified code should be added in this class."""
 
-    def __init__(self,
-                 vllm_config,
-                 local_rank,
-                 rank,
-                 distributed_init_method,
-                 is_driver_worker=False,
-                 **kwargs):
-        super().__init__(vllm_config, local_rank, rank,
-                         distributed_init_method, is_driver_worker, **kwargs)
-
-        from vllm.model_executor.layers.linear import \
-            WEIGHT_LOADER_V2_SUPPORTED
-        if "UnquantizedLinearMethod" in WEIGHT_LOADER_V2_SUPPORTED:
-            WEIGHT_LOADER_V2_SUPPORTED.remove("UnquantizedLinearMethod")
-
     def determine_available_memory(self) -> int:
         """Override determine_available_memory to use cached torchair kv_cache_bytes."""

View File

@@ -64,11 +64,12 @@ from vllm.multimodal.inputs import MultiModalKwargsItem, PlaceholderRange
 from vllm.multimodal.utils import group_mm_kwargs_by_modality
 from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import SamplingType
-from vllm.sequence import IntermediateTensors, PoolerOutput
+from vllm.sequence import IntermediateTensors
 from vllm.tasks import GenerationTask, PoolingTask, SupportedTask
 from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler,
                         LazyLoader, cdiv, get_dtype_size,
                         is_pin_memory_available)
+from vllm.utils.jsontree import json_map_leaves
 from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadataBuilder
 from vllm.v1.attention.backends.utils import (
     AttentionCGSupport, reorder_batch_to_split_decodes_and_prefills)
@@ -144,7 +145,9 @@ else:
 if not vllm_version_is("0.10.2"):
     from vllm.v1.kv_cache_interface import UniformTypeKVCacheSpecs
+    from vllm.v1.outputs import PoolerOutput
 else:
+    from vllm.sequence import PoolerOutput
     UniformTypeKVCacheSpecs = None
@@ -1806,18 +1809,30 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                 device=hidden_states.device)
         seq_lens_cpu = self.seq_lens_cpu[:self.input_batch.num_reqs]
 
-        # Pooling models D2H & synchronize occurs in pooler.py:build_output
-        raw_pooler_output = self.model.pooler(
-            hidden_states=hidden_states, pooling_metadata=pooling_metadata)
+        if vllm_version_is("0.10.2"):
+            # Pooling models D2H & synchronize occurs in pooler.py:build_output
+            raw_pooler_output = self.model.pooler(
+                hidden_states=hidden_states, pooling_metadata=pooling_metadata)
+        else:
+            model = cast(VllmModelForPooling, self.model)
+            raw_pooler_output = model.pooler(
+                hidden_states=hidden_states,
+                pooling_metadata=pooling_metadata,
+            )
+            raw_pooler_output = json_map_leaves(
+                lambda x: x.to("cpu", non_blocking=True),
+                raw_pooler_output,
+            )
+            torch.npu.synchronize()
 
         pooler_output: list[Optional[torch.Tensor]] = []
         for raw_output, seq_len, prompt_len in zip(
                 raw_pooler_output, seq_lens_cpu, pooling_metadata.prompt_lens):
-            if seq_len == prompt_len:
-                pooler_output.append(raw_output.data)
+            if vllm_version_is("0.10.2"):
+                output = raw_output.data if seq_len == prompt_len else None
             else:
-                pooler_output.append(None)
+                output = raw_output if seq_len == prompt_len else None
+            pooler_output.append(output)
 
         return ModelRunnerOutput(
             req_ids=self.input_batch.req_ids,
@@ -2582,7 +2597,10 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         for task in self.get_supported_pooling_tasks():
             # Run a full batch with each task to ensure none of them OOMs
             output = self._dummy_pooler_run_task(hidden_states, task)
-            output_size[task] = output.get_data_nbytes()
+            if vllm_version_is("0.10.2"):
+                output_size[task] = output.get_data_nbytes()
+            else:
+                output_size[task] = sum(o.nbytes for o in output)
             del output  # Allow GC
 
         max_task = max(output_size.items(), key=lambda x: x[1])[0]

View File

@@ -116,6 +116,12 @@ class NPUWorker(WorkerBase):
         # Buffers saved before sleep
         self._sleep_saved_buffers: dict[str, torch.Tensor] = {}
 
+        # FixMe: this is a patch to fix the issue cause by https://github.com/vllm-project/vllm/commit/de94289a98d7ec52a5ef02719e01a1db8b505170
+        from vllm.model_executor.layers.linear import \
+            WEIGHT_LOADER_V2_SUPPORTED
+        if "UnquantizedLinearMethod" in WEIGHT_LOADER_V2_SUPPORTED:
+            WEIGHT_LOADER_V2_SUPPORTED.remove("UnquantizedLinearMethod")
+
     def sleep(self, level: int = 1) -> None:
         if not sleep_mode_enabled():
             raise ValueError(