From 484e7c59dcbe6ece47c9591ef1e6fbca69bb57c3 Mon Sep 17 00:00:00 2001 From: Li Wang Date: Thu, 22 Jan 2026 15:46:59 +0800 Subject: [PATCH] [CI] optimize lint term (#5986) ### What this PR does / why we need it? This patch aims to optimize the lint check. The main idea is to reduce unnecessary installation time. 1. Installing vllm is not required; appending the path of the vllm source to `PYTHONPATH` is sufficient. 2. Installing `requirements-dev.txt` is not required either; we have a pre-built image `quay.io/ascend-ci/vllm-ascend:lint` with all the requirements installed in advance. **NOTE**: the conditions for triggering image builds are: 1) Daily scheduled build; 2) Build when requirements are modified; 3) Manual build. This ensures that the dependencies in our image are up-to-date to the greatest extent possible. 3. `mypy` was separated from the `pre-commit` hook for performance reasons; we found that integrating `mypy` into the `pre-commit` hook resulted in poor performance. 4. Reduce the CPU core consumption from 16 to 8. ### Does this PR introduce _any_ user-facing change? The end-to-end lint time was reduced from 20 min per PR to 8 min per PR. ### How was this patch tested? 
- vLLM version: v0.13.0 - vLLM main: https://github.com/vllm-project/vllm/commit/2c24bc6996cb165fce92f780b388a5e39b3f4060 --------- Signed-off-by: wangli --- .github/workflows/_pre_commit.yml | 39 ++++++++--- .github/workflows/dockerfiles/Dockerfile.lint | 46 +++++++++++++ .../workflows/schedule_lint_image_build.yaml | 67 +++++++++++++++++++ .pre-commit-config.yaml | 34 ---------- tests/e2e/conftest.py | 4 +- tools/mypy.sh | 8 ++- vllm_ascend/_310p/attention/attention_v1.py | 15 ++++- vllm_ascend/attention/attention_v1.py | 7 +- .../kv_transfer/kv_p2p/mooncake_connector.py | 45 ++++++++----- vllm_ascend/xlite/xlite.py | 8 +-- 10 files changed, 196 insertions(+), 77 deletions(-) create mode 100644 .github/workflows/dockerfiles/Dockerfile.lint create mode 100644 .github/workflows/schedule_lint_image_build.yaml diff --git a/.github/workflows/_pre_commit.yml b/.github/workflows/_pre_commit.yml index dbe30a8e..dc848c2a 100644 --- a/.github/workflows/_pre_commit.yml +++ b/.github/workflows/_pre_commit.yml @@ -12,19 +12,16 @@ permissions: jobs: pre-commit: - runs-on: linux-amd64-cpu-16-hk + runs-on: linux-amd64-cpu-8-hk container: # Build it from https://github.com/nv-action/vllm-benchmarks/blob/main/Dockerfile - image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-cpu - env: - GOPROXY: https://goproxy.io,direct - GITHUB_WORKSPACE: /__w/vllm-ascend/vllm-ascend + image: quay.io/ascend-ci/vllm-ascend:lint steps: - name: Checkout vllm-project/vllm-ascend repo uses: actions/checkout@v6 # With problem matchers in a container, the output of $GITHUB_WORKSPACE and ${{ github.workspace }} are different. - # So we will just copy it into the path ${{ github.workspace }}. see https://github.com/actions/runner/issues/2058 + # So we will just copy it into a temp path. 
see https://github.com/actions/runner/issues/2058 - name: cp problem matchers run: | cp .github/workflows/matchers/actionlint.json "$RUNNER_TEMP/actionlint.json" @@ -41,14 +38,21 @@ jobs: repository: vllm-project/vllm path: ./vllm-empty ref: ${{ inputs.vllm }} - - name: Install vllm - working-directory: vllm-empty - run: | - VLLM_TARGET_DEVICE=empty python3 -m pip install . --extra-index https://download.pytorch.org/whl/cpu/ - - name: Install vllm-ascend dev + - uses: dorny/paths-filter@v3 + id: filter + with: + filters: | + lint_tracker: + - 'requirements.txt' + - 'requirements-dev.txt' + - 'requirements-lint.txt' + + - name: Install vllm-ascend dev (conditional) + if: steps.filter.outputs.lint_tracker == 'true' run: | git config --global --add safe.directory /__w/vllm-ascend/vllm-ascend pip install -r requirements-dev.txt --extra-index-url https://download.pytorch.org/whl/cpu + - name: Run pre-commit env: PRE_COMMIT_COLOR: always @@ -56,4 +60,17 @@ jobs: TERM: xterm-256color SHELLCHECK_OPTS: "--exclude=SC2046,SC2006,SC2086" # Exclude SC2046, SC2006, SC2086 for actionlint run: | + git config --global --add safe.directory /__w/vllm-ascend/vllm-ascend pre-commit run --all-files --hook-stage manual --show-diff-on-failure + - name: Run mypy + run: | + PYTHONPATH="$PYTHONPATH:$(pwd)/vllm-empty" + export PYTHONPATH + git config --global --add safe.directory /__w/vllm-ascend/vllm-ascend + # Run mypy for Python 3.10, 3.11, 3.12 manually + # Note: We are now separating mypy from pre-commit hooks for performance reasons. + for python_version in "3.10" "3.11" "3.12"; do + echo "============================" + tools/mypy.sh 1 "$python_version" + echo "============================" + done diff --git a/.github/workflows/dockerfiles/Dockerfile.lint b/.github/workflows/dockerfiles/Dockerfile.lint new file mode 100644 index 00000000..b01e7b7f --- /dev/null +++ b/.github/workflows/dockerfiles/Dockerfile.lint @@ -0,0 +1,46 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# + +FROM ascendai/python:3.11-ubuntu22.04 + +ARG TARGETARCH + +RUN apt-get update -y && \ + apt-get install -y curl git gcc g++ cmake libnuma-dev jq && \ + rm -rf /var/cache/apt/* && \ + rm -rf /var/lib/apt/lists/* + + +ARG VLLM_REPO=https://github.com/vllm-project/vllm.git +# For lint purpose, actually we need make a main2main matching. +ARG VLLM_COMMIT=d68209402ddab3f54a09bc1f4de9a9495a283b60 +RUN git clone $VLLM_REPO /vllm-workspace/vllm && \ + cd /vllm-workspace/vllm && \ + git checkout $VLLM_COMMIT + +# # Install vLLM common dependencies +RUN python3 -m pip install -r /vllm-workspace/vllm/requirements/common.txt --extra-index https://download.pytorch.org/whl/cpu/ && \ + python3 -m pip uninstall -y triton && \ + python3 -m pip cache purge + +COPY . 
/vllm-workspace/vllm-ascend/ + +RUN pip install -r /vllm-workspace/vllm-ascend/requirements-dev.txt --extra-index-url https://download.pytorch.org/whl/cpu && \ + pip cache purge && \ + rm -fr /vllm-workspace/ + +CMD ["/bin/bash"] diff --git a/.github/workflows/schedule_lint_image_build.yaml b/.github/workflows/schedule_lint_image_build.yaml new file mode 100644 index 00000000..3dc86095 --- /dev/null +++ b/.github/workflows/schedule_lint_image_build.yaml @@ -0,0 +1,67 @@ +name: 'Image build lint' +on: + schedule: + # Runs at 00:00 UTC+8 every day + - cron: '0 20 * * *' + workflow_dispatch: + push: + paths: + - 'Dockerfile.lint' + - 'requirements-lint.txt' + - 'requirements-dev.txt' + - 'requirements.txt' + +# only cancel in-progress runs of the same workflow +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + + build: + name: vllm-ascend lint image build + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + persist-credentials: false + + - name: Print + run: | + lscpu + - name: Docker meta + id: meta + uses: docker/metadata-action@v5 + with: + images: | + quay.io/ascend-ci/vllm-ascend + tags: lint + flavor: + latest=false + + - name: Build - Set up QEMU + uses: docker/setup-qemu-action@v3 + + - name: Build - Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Publish - Login to Quay Container Registry + if: ${{ github.repository_owner == 'vllm-project' }} + uses: docker/login-action@v3 + with: + registry: quay.io + username: ${{ vars.QUAY_CI_USERNAME }} + password: ${{ secrets.QUAY_CI_PASSWORD }} + + - name: Build and push + uses: docker/build-push-action@v6 + with: + # For now, we only build amd64 lint image + platforms: 'linux/amd64' + context: . 
+ file: .github/workflows/dockerfiles/Dockerfile.lint + push: true + labels: ${{ steps.meta.outputs.labels }} + tags: ${{ steps.meta.outputs.tags }} + provenance: false diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index fd2dc626..e18a67be 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -38,40 +38,6 @@ repos: - id: actionlint - repo: local hooks: - # For local development, you can run mypy using tools/mypy.sh script if needed. - # - id: mypy-local - # name: Run mypy for local Python installation - # entry: tools/mypy.sh 0 "local" - # language: system - # types: [python] - # stages: [pre-commit] # Don't run in CI - - id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward - name: Run mypy for Python 3.10 - entry: tools/mypy.sh 1 "3.10" - # Use system python because vllm installation is required - language: system - types: [python] - stages: [manual] # Only run in CI - - id: mypy-3.11 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward - name: Run mypy for Python 3.11 - entry: tools/mypy.sh 1 "3.11" - # Use system python because vllm installation is required - language: system - types: [python] - stages: [manual] # Only run in CI - - id: mypy-3.12 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward - name: Run mypy for Python 3.12 - entry: tools/mypy.sh 1 "3.12" - # Use system python because vllm installation is required - language: system - types: [python] - stages: [manual] # Only run in CI - # FIXME: enable shellcheck - # - id: shellcheck - # name: Lint shell scripts - # entry: tools/shellcheck.sh - # language: script - # types: [shell] - id: png-lint name: Lint PNG exports from excalidraw entry: tools/png-lint.sh diff --git a/tests/e2e/conftest.py b/tests/e2e/conftest.py index 1d7fd9ec..400d9778 100644 --- a/tests/e2e/conftest.py +++ b/tests/e2e/conftest.py @@ -453,9 +453,9 @@ class VllmRunner: if images is not 
None and (image := images[i]) is not None: multi_modal_data["image"] = image if videos is not None and (video := videos[i]) is not None: - multi_modal_data["video"] = video + multi_modal_data["video"] = video # type: ignore if audios is not None and (audio := audios[i]) is not None: - multi_modal_data["audio"] = audio + multi_modal_data["audio"] = audio # type: ignore text_prompt_kwargs: dict[str, Any] = { "multi_modal_data": multi_modal_data or None diff --git a/tools/mypy.sh b/tools/mypy.sh index caac0a1d..b8b14b0d 100755 --- a/tools/mypy.sh +++ b/tools/mypy.sh @@ -30,9 +30,13 @@ if [ $PYTHON_VERSION == "local" ]; then PYTHON_VERSION=$(python -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")') fi +# Define colors +GREEN='\033[0;32m' +NC='\033[0m' # No Color + run_mypy() { - echo "Running mypy on $1" - mypy --follow-imports skip --python-version "${PYTHON_VERSION}" "$@" + echo -e "${GREEN}Running mypy for $1 on python version: ${PYTHON_VERSION}${NC}" + mypy --follow-imports skip --check-untyped-defs --python-version "${PYTHON_VERSION}" "$@" } run_mypy vllm_ascend diff --git a/vllm_ascend/_310p/attention/attention_v1.py b/vllm_ascend/_310p/attention/attention_v1.py index 347c4bc1..9e3ccf2f 100644 --- a/vllm_ascend/_310p/attention/attention_v1.py +++ b/vllm_ascend/_310p/attention/attention_v1.py @@ -15,6 +15,7 @@ # This file is a part of the vllm-ascend project. 
# +from typing import Any import torch import torch_npu @@ -23,7 +24,7 @@ from vllm_ascend._310p.attention.attention_mask import AttentionMaskBuilder, bui from vllm_ascend._310p.attention.metadata_builder import AscendAttentionMetadataBuilder310P from vllm_ascend.attention.attention_v1 import AscendAttentionBackend as _BaseBackend from vllm_ascend.attention.attention_v1 import AscendAttentionBackendImpl as _BaseImpl -from vllm_ascend.attention.attention_v1 import AscendAttentionMetadataBuilder, AscendAttentionState +from vllm_ascend.attention.attention_v1 import AscendAttentionMetadataBuilder, AscendAttentionState, AscendMetadata from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, aligned_16, nd_to_nz_2d @@ -47,9 +48,17 @@ class AscendAttentionBackend310(_BaseBackend): class AscendAttentionBackendImpl310(_BaseImpl): - def forward_paged_attention(self, query, attn_metadata, output): + def forward_paged_attention( + self, + query: Any, + attn_metadata: AscendMetadata, + output: Any | None = None, + ) -> Any: if attn_metadata.seq_lens.device != query.device: - attn_metadata.seq_lens = attn_metadata.seq_lens.to(device=query.device, non_blocking=True) + attn_metadata.seq_lens = attn_metadata.seq_lens.to( + device=query.device, + non_blocking=True, + ) return super().forward_paged_attention(query, attn_metadata, output) def _forward_prefill_310p_fallback(self, query, key, value, attn_metadata, output): diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py index 933fde28..6c45b6dc 100644 --- a/vllm_ascend/attention/attention_v1.py +++ b/vllm_ascend/attention/attention_v1.py @@ -46,9 +46,7 @@ from vllm_ascend.device.device_op import DeviceOperator from vllm_ascend.ops.flashcomm2_oshard_manager import flashcomm2_oshard_manager from vllm_ascend.utils import vllm_version_is, weak_ref_tensors -# isort: off if vllm_version_is("0.13.0"): - from vllm.v1.attention.backends.utils import AttentionCGSupport, AttentionMetadataBuilder from 
vllm.attention.backends.abstract import ( # type: ignore AttentionBackend, AttentionImpl, @@ -59,20 +57,21 @@ if vllm_version_is("0.13.0"): AttentionBackendEnum, register_backend, ) + from vllm.v1.attention.backends.utils import AttentionCGSupport, AttentionMetadataBuilder else: from vllm.v1.attention.backend import ( # type: ignore AttentionBackend, AttentionCGSupport, AttentionImpl, AttentionLayer, - AttentionType, AttentionMetadataBuilder, + AttentionType, ) from vllm.v1.attention.backends.registry import ( # type: ignore AttentionBackendEnum, register_backend, ) -# isort: on + # default max value of sliding window size SWA_INT_MAX = 2147483647 diff --git a/vllm_ascend/distributed/kv_transfer/kv_p2p/mooncake_connector.py b/vllm_ascend/distributed/kv_transfer/kv_p2p/mooncake_connector.py index a54ef6cb..ee07e5f7 100644 --- a/vllm_ascend/distributed/kv_transfer/kv_p2p/mooncake_connector.py +++ b/vllm_ascend/distributed/kv_transfer/kv_p2p/mooncake_connector.py @@ -13,7 +13,7 @@ from collections import defaultdict, deque from collections.abc import Iterator from concurrent.futures import ThreadPoolExecutor from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, List, Optional, OrderedDict, Tuple +from typing import TYPE_CHECKING, Any, List, Optional, OrderedDict, Tuple, TypedDict import msgspec import numpy as np @@ -60,6 +60,11 @@ GET_META_MSG = b"get_meta_msg" DONE_RECVING_MSG = b"done_recving_msg" +class RemotePortInfo(TypedDict): + num: int + host: str + + class MooncakeAgentMetadata(msgspec.Struct, omit_defaults=True, dict=True): engine_id: str te_rpc_port: int @@ -384,7 +389,7 @@ class KVCacheRecvingThread(threading.Thread): remote_handshake_port: int, offset: int, tp_num_need_pulls: int, - remote_port_send_num: dict[int, dict[str, int | str]] = {}, + remote_port_send_num: dict[int, RemotePortInfo] = {}, all_task_done: bool = False): """Add a new request to the queue for processing.""" logger.debug(f"Adding request {request_id} to the 
queue.") @@ -458,8 +463,9 @@ class KVCacheRecvingThread(threading.Thread): self._send_done_signal_to_free_remote_port(remote_request_id, remote_host, remote_port_send_num) - def _send_done_signal_to_free_remote_port(self, request_id, remote_host, - remote_port_send_num): + def _send_done_signal_to_free_remote_port( + self, request_id: str, remote_host: str, + remote_port_send_num: dict[int, RemotePortInfo]): if self.side_channel_port != self.local_handshake_port \ or not remote_port_send_num: return @@ -708,9 +714,10 @@ class KVCacheRecvingThread(threading.Thread): logger.debug("Returned socket to pool for %s:%d", remote_host, remote_handshake_port) - def _send_done_recv_signal(self, request_id: str, remote_host: str, - remote_handshake_port: int, - remote_port_send_num: dict[int, dict[str, int | str]]): + def _send_done_recv_signal( + self, request_id: str, remote_host: str, + remote_handshake_port: int, + remote_port_send_num: dict[int, RemotePortInfo]): logger.debug("Sending done recving signal for request %s to %s:%d", request_id, remote_host, remote_handshake_port) sock: Optional[zmq.Socket] = None # type: ignore @@ -1177,7 +1184,7 @@ class MooncakeConnectorWorker: self.tp_num_need_pulls = num_d_block_heads // num_p_block_heads self.local_remote_block_port_mapping: dict[ str, Optional[List[List[int]]]] = {} - self.remote_port_send_num: dict[str, dict[int, dict[str, int | str]]] = {} + self.remote_port_send_num: dict[str, dict[int, RemotePortInfo]] = {} def _get_prefill_decode_size(self, vllm_config: VllmConfig): # get prefill tp and dp size from extra config @@ -1463,16 +1470,20 @@ class MooncakeConnectorWorker: return local_remote_block_port_mappings - def get_remote_port_send_num(local_remote_block_port_mappings): - remote_port_send_num: dict[int, dict[str, int | str]] = {} + def get_remote_port_send_num( + local_remote_block_port_mappings: dict[int, list[list[int]]] + ) -> dict[int, RemotePortInfo]: + remote_port_send_num: dict[int, RemotePortInfo] = {} for 
port in range(self._prefill_tp_size * meta.remote_pcp_size): - remote_host = meta.remote_multi_nodes_meta_mapping[str(port)]['host'] - remote_port_send_num[meta.remote_port + port] = {} - remote_port_send_num[meta.remote_port + port]['num'] = 0 - remote_port_send_num[meta.remote_port + port]['host'] = remote_host - for local_port in local_remote_block_port_mappings.keys(): - remote_port_head_list = local_remote_block_port_mappings[ - local_port] + remote_host = str(meta.remote_multi_nodes_meta_mapping[str( + port)]['host']) + remote_port_send_num[meta.remote_port + port] = { + 'num': 0, + 'host': remote_host + } + + for remote_port_head_list in local_remote_block_port_mappings.values( + ): for remote_port_list in remote_port_head_list: for remote_port in remote_port_list: remote_port_send_num[remote_port]['num'] += 1 diff --git a/vllm_ascend/xlite/xlite.py b/vllm_ascend/xlite/xlite.py index f3007a6f..aa21a944 100644 --- a/vllm_ascend/xlite/xlite.py +++ b/vllm_ascend/xlite/xlite.py @@ -25,7 +25,7 @@ from vllm.distributed import (get_ep_group, from vllm.forward_context import get_forward_context from vllm.logger import logger from vllm.sequence import IntermediateTensors -from xlite._C import (AttnMHA, Model, ModelAttnMeta, ModelConfig, Runtime, +from xlite._C import (AttnMHA, Model, ModelAttnMeta, ModelConfig, Runtime, # type: ignore[attr-defined] ScoringFuncSoftmax) import vllm_ascend.envs as envs_ascend @@ -214,10 +214,10 @@ class QwenMoeXliteModel(LlamaXliteModel): config.def_dp_size = vllm_config.parallel_config.data_parallel_size config.moe_ep_size = ep_group.world_size if vllm_config.parallel_config.enable_expert_parallel else 1 config.moe_tp_size = 1 if vllm_config.parallel_config.enable_expert_parallel else ep_group.world_size - config.experts_weight_transpose = True + config.experts_weight_transpose = True # type: ignore config.moe_intermediate_size = hf_config.moe_intermediate_size - config.norm_topk_prob = hf_config.norm_topk_prob - config.scoring_func = 
ScoringFuncSoftmax + config.norm_topk_prob = hf_config.norm_topk_prob # type: ignore + config.scoring_func = ScoringFuncSoftmax # type: ignore return config def _build_model(self, runnable: nn.Module, vllm_config: VllmConfig,