From a055183821120f66b44b514b1795c51ec142df2e Mon Sep 17 00:00:00 2001 From: wangxiyuan Date: Thu, 25 Sep 2025 07:36:51 +0800 Subject: [PATCH] [CI] Upgrade vLLM version (#3139) Upgrade vLLM version to the newest commit. - Fix the breaking change introduced by https://github.com/vllm-project/vllm/commit/969b4da3a6ab737f72cb33db502b4c0bb70d4139 - Add a patch to quick-fix torchair https://github.com/vllm-project/vllm/commit/de94289a98d7ec52a5ef02719e01a1db8b505170 - fix the ut error introduced by https://github.com/vllm-project/vllm/commit/de94289a98d7ec52a5ef02719e01a1db8b505170 Close: https://github.com/vllm-project/vllm-ascend/issues/3138 - vLLM version: v0.10.2 - vLLM main: https://github.com/vllm-project/vllm/commit/f225ea7dd98e9f29752e5c032cd4a8ee1d712f16 --------- Signed-off-by: wangxiyuan Signed-off-by: MengqingCao Co-authored-by: MengqingCao --- .github/workflows/format_pr_body.yaml | 2 +- .github/workflows/vllm_ascend_test.yaml | 6 +- .github/workflows/vllm_ascend_test_full.yaml | 2 +- tests/ut/models/test_qwen2_5_vl.py | 4 ++ tests/ut/ops/test_linear.py | 4 ++ .../patch/worker/patch_common/__init__.py | 1 + .../patch_common/patch_weight_loader.py | 60 +++++++++++++++++++ vllm_ascend/torchair/torchair_worker.py | 14 +++++ vllm_ascend/worker/model_runner_v1.py | 27 +++++---- 9 files changed, 105 insertions(+), 15 deletions(-) create mode 100644 vllm_ascend/patch/worker/patch_common/patch_weight_loader.py diff --git a/.github/workflows/format_pr_body.yaml b/.github/workflows/format_pr_body.yaml index 56d85b0..b053c51 100644 --- a/.github/workflows/format_pr_body.yaml +++ b/.github/workflows/format_pr_body.yaml @@ -36,7 +36,7 @@ jobs: - name: Get vLLM version run: | - VLLM_COMMIT=f225ea7dd98e9f29752e5c032cd4a8ee1d712f16 + VLLM_COMMIT=b1068903fdca26cf6b4a1a51a32c3365ce3ac636 echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> $GITHUB_ENV - name: Checkout repository diff --git a/.github/workflows/vllm_ascend_test.yaml 
b/.github/workflows/vllm_ascend_test.yaml index a26538c..49052b5 100644 --- a/.github/workflows/vllm_ascend_test.yaml +++ b/.github/workflows/vllm_ascend_test.yaml @@ -42,7 +42,7 @@ jobs: lint: uses: ./.github/workflows/pre-commit.yml with: - vllm: f225ea7dd98e9f29752e5c032cd4a8ee1d712f16 + vllm: b1068903fdca26cf6b4a1a51a32c3365ce3ac636 changes: runs-on: ubuntu-latest @@ -83,7 +83,7 @@ jobs: VLLM_USE_MODELSCOPE: True strategy: matrix: - vllm_version: [f225ea7dd98e9f29752e5c032cd4a8ee1d712f16, v0.10.2] + vllm_version: [b1068903fdca26cf6b4a1a51a32c3365ce3ac636, v0.10.2] steps: - name: Install packages run: | @@ -138,7 +138,7 @@ jobs: name: e2e-light strategy: matrix: - vllm_version: [f225ea7dd98e9f29752e5c032cd4a8ee1d712f16, v0.10.2] + vllm_version: [b1068903fdca26cf6b4a1a51a32c3365ce3ac636, v0.10.2] # Note (yikun): If CI resource are limited we can split job into two chain jobs needs: [lint, changes] # only trigger e2e test after lint passed and the change is e2e related with pull request. 
diff --git a/.github/workflows/vllm_ascend_test_full.yaml b/.github/workflows/vllm_ascend_test_full.yaml index e5836b5..d5144cc 100644 --- a/.github/workflows/vllm_ascend_test_full.yaml +++ b/.github/workflows/vllm_ascend_test_full.yaml @@ -68,7 +68,7 @@ jobs: name: e2e-full strategy: matrix: - vllm_version: [f225ea7dd98e9f29752e5c032cd4a8ee1d712f16, v0.10.2] + vllm_version: [b1068903fdca26cf6b4a1a51a32c3365ce3ac636, v0.10.2] needs: [changes] if: ${{ needs.changes.outputs.e2e_tracker == 'true' }} uses: ./.github/workflows/_e2e_test.yaml diff --git a/tests/ut/models/test_qwen2_5_vl.py b/tests/ut/models/test_qwen2_5_vl.py index e9982f9..06fb07d 100644 --- a/tests/ut/models/test_qwen2_5_vl.py +++ b/tests/ut/models/test_qwen2_5_vl.py @@ -298,6 +298,10 @@ class TestAscendQwen2_5_VisionTransformer(PytestBase): "vllm_ascend.ops.linear_op.get_tp_group", return_value=mock_group, ) + mocker.patch( + "vllm.distributed.parallel_state.get_tp_group", + return_value=mock_group, + ) vision_transformer = AscendQwen2_5_VisionTransformer( vision_config, diff --git a/tests/ut/ops/test_linear.py b/tests/ut/ops/test_linear.py index 083416e..e22d7ca 100644 --- a/tests/ut/ops/test_linear.py +++ b/tests/ut/ops/test_linear.py @@ -33,6 +33,10 @@ class BaseLinearTest(unittest.TestCase): return_value=self.mock_group), patch("vllm_ascend.ops.linear_op.get_tp_group", return_value=self.mock_group), + patch( + "vllm.distributed.parallel_state.get_tp_group", + return_value=self.mock_group, + ), patch("vllm_ascend.utils.mlp_tp_enable", return_value=True), patch("vllm_ascend.utils.oproj_tp_enable", return_value=True) ] diff --git a/vllm_ascend/patch/worker/patch_common/__init__.py b/vllm_ascend/patch/worker/patch_common/__init__.py index 37407b4..baf5321 100644 --- a/vllm_ascend/patch/worker/patch_common/__init__.py +++ b/vllm_ascend/patch/worker/patch_common/__init__.py @@ -22,6 +22,7 @@ if HAS_TRITON: import vllm_ascend.patch.worker.patch_common.patch_distributed # noqa import 
vllm_ascend.patch.worker.patch_common.patch_logits # noqa +import vllm_ascend.patch.worker.patch_common.patch_weight_loader # noqa # TODO: revert me when triton import is fixed # import vllm_ascend.patch.worker.patch_common.patch_minicpm # noqa diff --git a/vllm_ascend/patch/worker/patch_common/patch_weight_loader.py b/vllm_ascend/patch/worker/patch_common/patch_weight_loader.py new file mode 100644 index 0000000..4bbd6d3 --- /dev/null +++ b/vllm_ascend/patch/worker/patch_common/patch_weight_loader.py @@ -0,0 +1,60 @@ +import torch +from torch.nn.parameter import Parameter +from vllm.logger import init_logger +# yapf: disable +from vllm.model_executor.parameter import ModelWeightParameter +# yapf: enable +from vllm.model_executor.utils import set_weight_attrs +from vllm.utils import GiB_bytes + +from vllm_ascend.utils import vllm_version_is + +logger = init_logger(__name__) + + +def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int, + output_partition_sizes: list[int], input_size: int, + output_size: int, params_dtype: torch.dtype, + **extra_weight_attrs): + from vllm_ascend.ascend_config import get_ascend_config + ascend_config = get_ascend_config() + # This method creates unquantized linear weights. + # The weights are not quantized, and they are not sharded. + # The amount of memory allocated for the weights is + # sum(output_partition_sizes) * input_size_per_partition. 
+ try: + if ascend_config.torchair_graph_config.enabled: + weight = Parameter(torch.empty(sum(output_partition_sizes), + input_size_per_partition, + dtype=params_dtype), + requires_grad=False) + else: + weight_loader = extra_weight_attrs.pop("weight_loader") + weight = ModelWeightParameter(data=torch.empty( + sum(output_partition_sizes), + input_size_per_partition, + dtype=params_dtype), + input_dim=1, + output_dim=0, + weight_loader=weight_loader) + except torch.cuda.OutOfMemoryError as e: + logger.error("Failed to create unquantized linear weights: %s", e) + if torch.cuda.is_available(): + logger.debug("CUDA device: %s", torch.cuda.current_device()) + logger.debug("Allocated: %.2f GiB", + torch.cuda.memory_allocated() / GiB_bytes) + logger.debug("Reserved: %.2f GiB", + torch.cuda.memory_reserved() / GiB_bytes) + raise RuntimeError( + "Failed to create unquantized linear weights. " + "This may be caused by insufficient memory to allocate " + "the weight.") from e + if ascend_config.torchair_graph_config.enabled: + set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0}) + layer.register_parameter("weight", weight) + set_weight_attrs(weight, extra_weight_attrs) + + +if not vllm_version_is("0.10.2"): + from vllm.model_executor.layers.linear import UnquantizedLinearMethod + UnquantizedLinearMethod.create_weights = create_weights diff --git a/vllm_ascend/torchair/torchair_worker.py b/vllm_ascend/torchair/torchair_worker.py index dbee800..ec3f1aa 100644 --- a/vllm_ascend/torchair/torchair_worker.py +++ b/vllm_ascend/torchair/torchair_worker.py @@ -28,6 +28,20 @@ from vllm_ascend.worker.worker_v1 import NPUWorker class NPUTorchairWorker(NPUWorker): """Torchair worker bases on NPUWorker. 
Only torchair specified code should be added in this class.""" + def __init__(self, + vllm_config, + local_rank, + rank, + distributed_init_method, + is_driver_worker=False, + **kwargs): + super().__init__(vllm_config, local_rank, rank, + distributed_init_method, is_driver_worker, **kwargs) + from vllm.model_executor.layers.linear import \ + WEIGHT_LOADER_V2_SUPPORTED + if "UnquantizedLinearMethod" in WEIGHT_LOADER_V2_SUPPORTED: + WEIGHT_LOADER_V2_SUPPORTED.remove("UnquantizedLinearMethod") + def determine_available_memory(self) -> int: """Override determine_available_memory to use cached torchair kv_cache_bytes.""" diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index f4656dd..670e69a 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -304,17 +304,24 @@ class NPUModelRunner(LoRAModelRunnerMixin): (self.max_num_tokens, self.model_config.get_hidden_size()), dtype=self.dtype, device=self.device) - # Set up Attention - self.attn_backend = get_attn_backend( - 0, - self.dtype, - None, - self.block_size, - self.model_config.is_attention_free, - use_mla=self.model_config.use_mla, - ) - + if vllm_version_is("0.10.2"): + self.attn_backend = get_attn_backend( + 0, + self.dtype, + None, + self.block_size, + self.model_config.is_attention_free, + use_mla=self.model_config.use_mla, + ) + else: + self.attn_backend = get_attn_backend( + 0, + self.dtype, + None, + self.block_size, + use_mla=self.model_config.use_mla, + ) if torch.version.cann.startswith("8.3"): self.attn_mask_builder = AttentionMaskBuilder( self.scheduler_config.max_num_batched_tokens, self.dtype,