From 2930e4a6bdcc6405c9924c35d70c5bc00f844b04 Mon Sep 17 00:00:00 2001 From: wangxiyuan Date: Fri, 26 Sep 2025 06:18:15 +0800 Subject: [PATCH] [CI] Upgrade vllm to newest commit (#3182) ### What this PR does / why we need it? Upgrade vLLM to the newest commit - Fix aclgraph not working, caused by https://github.com/vllm-project/vllm/commit/24fab45d96a91b491db338ee02cd24e55b7fbb5f - Fix PoolerOutput import error, caused by https://github.com/vllm-project/vllm/commit/755ed7b05be4743237d3339c4ff8c22bcaae04f4 - Fix the aclgraph weight load error, keeping it consistent with the torchair fix. https://github.com/vllm-project/vllm/commit/4492e3a55428e161ca8db381edc28263e5da4c8d ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? All tests should pass - vLLM version: v0.10.2 - vLLM main: https://github.com/vllm-project/vllm/commit/52d0cb845866869d587fc013a7c59e60a86ebcf2 --------- Signed-off-by: wangxiyuan --- .github/workflows/format_pr_body.yaml | 2 +- .github/workflows/vllm_ascend_test.yaml | 6 ++-- .github/workflows/vllm_ascend_test_full.yaml | 2 +- .../patch_common/patch_weight_loader.py | 26 +++----------- vllm_ascend/platform.py | 5 +++ vllm_ascend/quantization/quant_config.py | 5 +-- vllm_ascend/torchair/torchair_worker.py | 14 -------- vllm_ascend/worker/model_runner_v1.py | 36 ++++++++++++++----- vllm_ascend/worker/worker_v1.py | 6 ++++ 9 files changed, 49 insertions(+), 53 deletions(-) diff --git a/.github/workflows/format_pr_body.yaml b/.github/workflows/format_pr_body.yaml index 7fc23ee..a99d812 100644 --- a/.github/workflows/format_pr_body.yaml +++ b/.github/workflows/format_pr_body.yaml @@ -36,7 +36,7 @@ jobs: - name: Get vLLM version run: | - VLLM_COMMIT=52d0cb845866869d587fc013a7c59e60a86ebcf2 + VLLM_COMMIT=17b4c6685ce62d5652654784d6771a3d38e4273e echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> $GITHUB_ENV - name: Checkout repository diff --git a/.github/workflows/vllm_ascend_test.yaml 
b/.github/workflows/vllm_ascend_test.yaml index d1e1af5..a45d676 100644 --- a/.github/workflows/vllm_ascend_test.yaml +++ b/.github/workflows/vllm_ascend_test.yaml @@ -42,7 +42,7 @@ jobs: lint: uses: ./.github/workflows/pre-commit.yml with: - vllm: 52d0cb845866869d587fc013a7c59e60a86ebcf2 + vllm: 17b4c6685ce62d5652654784d6771a3d38e4273e changes: runs-on: ubuntu-latest @@ -83,7 +83,7 @@ jobs: VLLM_USE_MODELSCOPE: True strategy: matrix: - vllm_version: [52d0cb845866869d587fc013a7c59e60a86ebcf2, v0.10.2] + vllm_version: [17b4c6685ce62d5652654784d6771a3d38e4273e, v0.10.2] steps: - name: Install packages run: | @@ -138,7 +138,7 @@ jobs: name: e2e-light strategy: matrix: - vllm_version: [52d0cb845866869d587fc013a7c59e60a86ebcf2, v0.10.2] + vllm_version: [17b4c6685ce62d5652654784d6771a3d38e4273e, v0.10.2] # Note (yikun): If CI resource are limited we can split job into two chain jobs needs: [lint, changes] # only trigger e2e test after lint passed and the change is e2e related with pull request. 
diff --git a/.github/workflows/vllm_ascend_test_full.yaml b/.github/workflows/vllm_ascend_test_full.yaml index 1d628dd..6306032 100644 --- a/.github/workflows/vllm_ascend_test_full.yaml +++ b/.github/workflows/vllm_ascend_test_full.yaml @@ -68,7 +68,7 @@ jobs: name: e2e-full strategy: matrix: - vllm_version: [52d0cb845866869d587fc013a7c59e60a86ebcf2, v0.10.2] + vllm_version: [17b4c6685ce62d5652654784d6771a3d38e4273e, v0.10.2] needs: [changes] if: ${{ needs.changes.outputs.e2e_tracker == 'true' }} uses: ./.github/workflows/_e2e_test.yaml diff --git a/vllm_ascend/patch/worker/patch_common/patch_weight_loader.py b/vllm_ascend/patch/worker/patch_common/patch_weight_loader.py index 4bbd6d3..10705d3 100644 --- a/vllm_ascend/patch/worker/patch_common/patch_weight_loader.py +++ b/vllm_ascend/patch/worker/patch_common/patch_weight_loader.py @@ -1,9 +1,6 @@ import torch from torch.nn.parameter import Parameter from vllm.logger import init_logger -# yapf: disable -from vllm.model_executor.parameter import ModelWeightParameter -# yapf: enable from vllm.model_executor.utils import set_weight_attrs from vllm.utils import GiB_bytes @@ -16,27 +13,15 @@ def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int, output_partition_sizes: list[int], input_size: int, output_size: int, params_dtype: torch.dtype, **extra_weight_attrs): - from vllm_ascend.ascend_config import get_ascend_config - ascend_config = get_ascend_config() # This method creates unquantized linear weights. # The weights are not quantized, and they are not sharded. # The amount of memory allocated for the weights is # sum(output_partition_sizes) * input_size_per_partition. 
try: - if ascend_config.torchair_graph_config.enabled: - weight = Parameter(torch.empty(sum(output_partition_sizes), - input_size_per_partition, - dtype=params_dtype), - requires_grad=False) - else: - weight_loader = extra_weight_attrs.pop("weight_loader") - weight = ModelWeightParameter(data=torch.empty( - sum(output_partition_sizes), - input_size_per_partition, - dtype=params_dtype), - input_dim=1, - output_dim=0, - weight_loader=weight_loader) + weight = Parameter(torch.empty(sum(output_partition_sizes), + input_size_per_partition, + dtype=params_dtype), + requires_grad=False) except torch.cuda.OutOfMemoryError as e: logger.error("Failed to create unquantized linear weights: %s", e) if torch.cuda.is_available(): @@ -49,8 +34,7 @@ def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int, "Failed to create unquantized linear weights. " "This may be caused by insufficient memory to allocate " "the weight.") from e - if ascend_config.torchair_graph_config.enabled: - set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0}) + set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0}) layer.register_parameter("weight", weight) set_weight_attrs(weight, extra_weight_attrs) diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py index f00abca..f25f984 100644 --- a/vllm_ascend/platform.py +++ b/vllm_ascend/platform.py @@ -209,6 +209,11 @@ class NPUPlatform(Platform): # set cudaprah sizes before extending `compilation_config.splitting_ops` vllm_config._set_cudagraph_sizes() + # TODO: Full graph is fully supported later, and the default value will be set to full graph. 
+ if not vllm_version_is("v0.10.2"): + if compilation_config.cudagraph_mode == CUDAGraphMode.FULL_AND_PIECEWISE: + compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE + if compilation_config.cudagraph_mode == CUDAGraphMode.NONE: compilation_config.level = CompilationLevel.NO_COMPILATION # TODO: Currently MLA does not support FULL_DECODE_ONLY, remove the second condition diff --git a/vllm_ascend/quantization/quant_config.py b/vllm_ascend/quantization/quant_config.py index 1a5e74d..130251c 100644 --- a/vllm_ascend/quantization/quant_config.py +++ b/vllm_ascend/quantization/quant_config.py @@ -33,7 +33,6 @@ from vllm.model_executor.layers.quantization.base_config import ( from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod from vllm.model_executor.layers.vocab_parallel_embedding import ( UnquantizedEmbeddingMethod, VocabParallelEmbedding) -from vllm.model_executor.parameter import PerTensorScaleParameter from vllm.model_executor.utils import set_weight_attrs from vllm_ascend.distributed.parallel_state import (get_mlp_tp_group, @@ -251,7 +250,6 @@ class AscendLinearMethod(LinearMethodBase): **extra_weight_attrs, ) -> None: output_size_per_partition = sum(output_partition_sizes) - weight_loader = extra_weight_attrs.get("weight_loader") weight_dict = self.quant_method.get_weight(input_size_per_partition, output_size_per_partition, @@ -264,8 +262,7 @@ class AscendLinearMethod(LinearMethodBase): pertensor_dict = self.quant_method.get_pertensor_param(params_dtype) for pertensor_name, pertensor_param in pertensor_dict.items(): - param = PerTensorScaleParameter(data=pertensor_param, - weight_loader=weight_loader) + param = torch.nn.Parameter(pertensor_param, requires_grad=False) # disable warning param.ignore_warning = True layer.register_parameter(pertensor_name, param) diff --git a/vllm_ascend/torchair/torchair_worker.py b/vllm_ascend/torchair/torchair_worker.py index ec3f1aa..dbee800 100644 --- a/vllm_ascend/torchair/torchair_worker.py 
+++ b/vllm_ascend/torchair/torchair_worker.py @@ -28,20 +28,6 @@ from vllm_ascend.worker.worker_v1 import NPUWorker class NPUTorchairWorker(NPUWorker): """Torchair worker bases on NPUWorker. Only torchair specified code should be added in this class.""" - def __init__(self, - vllm_config, - local_rank, - rank, - distributed_init_method, - is_driver_worker=False, - **kwargs): - super().__init__(vllm_config, local_rank, rank, - distributed_init_method, is_driver_worker, **kwargs) - from vllm.model_executor.layers.linear import \ - WEIGHT_LOADER_V2_SUPPORTED - if "UnquantizedLinearMethod" in WEIGHT_LOADER_V2_SUPPORTED: - WEIGHT_LOADER_V2_SUPPORTED.remove("UnquantizedLinearMethod") - def determine_available_memory(self) -> int: """Override determine_available_memory to use cached torchair kv_cache_bytes.""" diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 124fdcd..4d9e338 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -64,11 +64,12 @@ from vllm.multimodal.inputs import MultiModalKwargsItem, PlaceholderRange from vllm.multimodal.utils import group_mm_kwargs_by_modality from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingType -from vllm.sequence import IntermediateTensors, PoolerOutput +from vllm.sequence import IntermediateTensors from vllm.tasks import GenerationTask, PoolingTask, SupportedTask from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, LazyLoader, cdiv, get_dtype_size, is_pin_memory_available) +from vllm.utils.jsontree import json_map_leaves from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadataBuilder from vllm.v1.attention.backends.utils import ( AttentionCGSupport, reorder_batch_to_split_decodes_and_prefills) @@ -144,7 +145,9 @@ else: if not vllm_version_is("0.10.2"): from vllm.v1.kv_cache_interface import UniformTypeKVCacheSpecs + from vllm.v1.outputs import PoolerOutput else: + from 
vllm.sequence import PoolerOutput UniformTypeKVCacheSpecs = None @@ -1806,18 +1809,30 @@ class NPUModelRunner(LoRAModelRunnerMixin): device=hidden_states.device) seq_lens_cpu = self.seq_lens_cpu[:self.input_batch.num_reqs] - # Pooling models D2H & synchronize occurs in pooler.py:build_output - raw_pooler_output = self.model.pooler( - hidden_states=hidden_states, pooling_metadata=pooling_metadata) + if vllm_version_is("0.10.2"): + # Pooling models D2H & synchronize occurs in pooler.py:build_output + raw_pooler_output = self.model.pooler( + hidden_states=hidden_states, pooling_metadata=pooling_metadata) + else: + model = cast(VllmModelForPooling, self.model) + raw_pooler_output = model.pooler( + hidden_states=hidden_states, + pooling_metadata=pooling_metadata, + ) + raw_pooler_output = json_map_leaves( + lambda x: x.to("cpu", non_blocking=True), + raw_pooler_output, + ) + torch.npu.synchronize() pooler_output: list[Optional[torch.Tensor]] = [] for raw_output, seq_len, prompt_len in zip( raw_pooler_output, seq_lens_cpu, pooling_metadata.prompt_lens): - - if seq_len == prompt_len: - pooler_output.append(raw_output.data) + if vllm_version_is("0.10.2"): + output = raw_output.data if seq_len == prompt_len else None else: - pooler_output.append(None) + output = raw_output if seq_len == prompt_len else None + pooler_output.append(output) return ModelRunnerOutput( req_ids=self.input_batch.req_ids, @@ -2582,7 +2597,10 @@ class NPUModelRunner(LoRAModelRunnerMixin): for task in self.get_supported_pooling_tasks(): # Run a full batch with each task to ensure none of them OOMs output = self._dummy_pooler_run_task(hidden_states, task) - output_size[task] = output.get_data_nbytes() + if vllm_version_is("0.10.2"): + output_size[task] = output.get_data_nbytes() + else: + output_size[task] = sum(o.nbytes for o in output) del output # Allow GC max_task = max(output_size.items(), key=lambda x: x[1])[0] diff --git a/vllm_ascend/worker/worker_v1.py b/vllm_ascend/worker/worker_v1.py index 
fedec87..c1fc800 100644 --- a/vllm_ascend/worker/worker_v1.py +++ b/vllm_ascend/worker/worker_v1.py @@ -116,6 +116,12 @@ class NPUWorker(WorkerBase): # Buffers saved before sleep self._sleep_saved_buffers: dict[str, torch.Tensor] = {} + # FixMe: this is a patch to fix the issue caused by https://github.com/vllm-project/vllm/commit/de94289a98d7ec52a5ef02719e01a1db8b505170 + from vllm.model_executor.layers.linear import \ + WEIGHT_LOADER_V2_SUPPORTED + if "UnquantizedLinearMethod" in WEIGHT_LOADER_V2_SUPPORTED: + WEIGHT_LOADER_V2_SUPPORTED.remove("UnquantizedLinearMethod") + def sleep(self, level: int = 1) -> None: if not sleep_mode_enabled(): raise ValueError(