diff --git a/.github/workflows/_pre_commit.yml b/.github/workflows/_pre_commit.yml index dbe30a8e..dc848c2a 100644 --- a/.github/workflows/_pre_commit.yml +++ b/.github/workflows/_pre_commit.yml @@ -12,19 +12,16 @@ permissions: jobs: pre-commit: - runs-on: linux-amd64-cpu-16-hk + runs-on: linux-amd64-cpu-8-hk container: # Build it from https://github.com/nv-action/vllm-benchmarks/blob/main/Dockerfile - image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-cpu - env: - GOPROXY: https://goproxy.io,direct - GITHUB_WORKSPACE: /__w/vllm-ascend/vllm-ascend + image: quay.io/ascend-ci/vllm-ascend:lint steps: - name: Checkout vllm-project/vllm-ascend repo uses: actions/checkout@v6 # With problem matchers in a container, the output of $GITHUB_WORKSPACE and ${{ github.workspace }} are different. - # So we will just copy it into the path ${{ github.workspace }}. see https://github.com/actions/runner/issues/2058 + # So we will just copy it into a temp path. see https://github.com/actions/runner/issues/2058 - name: cp problem matchers run: | cp .github/workflows/matchers/actionlint.json "$RUNNER_TEMP/actionlint.json" @@ -41,14 +38,21 @@ jobs: repository: vllm-project/vllm path: ./vllm-empty ref: ${{ inputs.vllm }} - - name: Install vllm - working-directory: vllm-empty - run: | - VLLM_TARGET_DEVICE=empty python3 -m pip install . 
--extra-index https://download.pytorch.org/whl/cpu/ - - name: Install vllm-ascend dev + - uses: dorny/paths-filter@v3 + id: filter + with: + filters: | + lint_tracker: + - 'requirements.txt' + - 'requirements-dev.txt' + - 'requirements-lint.txt' + + - name: Install vllm-ascend dev (conditional) + if: steps.filter.outputs.lint_tracker == 'true' run: | git config --global --add safe.directory /__w/vllm-ascend/vllm-ascend pip install -r requirements-dev.txt --extra-index-url https://download.pytorch.org/whl/cpu + - name: Run pre-commit env: PRE_COMMIT_COLOR: always @@ -56,4 +60,17 @@ jobs: TERM: xterm-256color SHELLCHECK_OPTS: "--exclude=SC2046,SC2006,SC2086" # Exclude SC2046, SC2006, SC2086 for actionlint run: | + git config --global --add safe.directory /__w/vllm-ascend/vllm-ascend pre-commit run --all-files --hook-stage manual --show-diff-on-failure + - name: Run mypy + run: | + PYTHONPATH="$PYTHONPATH:$(pwd)/vllm-empty" + export PYTHONPATH + git config --global --add safe.directory /__w/vllm-ascend/vllm-ascend + # Run mypy for Python 3.10, 3.11, 3.12 manually + # Note: We are now separating mypy from pre-commit hooks for performance reasons. + for python_version in "3.10" "3.11" "3.12"; do + echo "============================" + tools/mypy.sh 1 "$python_version" + echo "============================" + done diff --git a/.github/workflows/dockerfiles/Dockerfile.lint b/.github/workflows/dockerfiles/Dockerfile.lint new file mode 100644 index 00000000..b01e7b7f --- /dev/null +++ b/.github/workflows/dockerfiles/Dockerfile.lint @@ -0,0 +1,46 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-ascend project.
+#
+
+FROM ascendai/python:3.11-ubuntu22.04
+
+ARG TARGETARCH
+
+RUN apt-get update -y && \
+    apt-get install -y curl git gcc g++ cmake libnuma-dev jq && \
+    rm -rf /var/cache/apt/* && \
+    rm -rf /var/lib/apt/lists/*
+
+
+ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
+# For lint purposes, we actually need to make a main-to-main matching.
+ARG VLLM_COMMIT=d68209402ddab3f54a09bc1f4de9a9495a283b60
+RUN git clone $VLLM_REPO /vllm-workspace/vllm && \
+    cd /vllm-workspace/vllm && \
+    git checkout $VLLM_COMMIT
+
+# Install vLLM common dependencies
+RUN python3 -m pip install -r /vllm-workspace/vllm/requirements/common.txt --extra-index-url https://download.pytorch.org/whl/cpu/ && \
+    python3 -m pip uninstall -y triton && \
+    python3 -m pip cache purge
+
+COPY . 
/vllm-workspace/vllm-ascend/
+
+RUN pip install -r /vllm-workspace/vllm-ascend/requirements-dev.txt --extra-index-url https://download.pytorch.org/whl/cpu && \
+    pip cache purge && \
+    rm -fr /vllm-workspace/
+
+CMD ["/bin/bash"]
diff --git a/.github/workflows/schedule_lint_image_build.yaml b/.github/workflows/schedule_lint_image_build.yaml
new file mode 100644
index 00000000..3dc86095
--- /dev/null
+++ b/.github/workflows/schedule_lint_image_build.yaml
@@ -0,0 +1,67 @@
+name: 'Image build lint'
+on:
+  schedule:
+    # Runs at 04:00 UTC+8 (20:00 UTC) every day
+    - cron: '0 20 * * *'
+  workflow_dispatch:
+  push:
+    paths:
+      # NOTE: the Dockerfile lives under .github/workflows/dockerfiles/;
+      # a bare 'Dockerfile.lint' filter would never match.
+      - '.github/workflows/dockerfiles/Dockerfile.lint'
+      - 'requirements-lint.txt'
+      - 'requirements-dev.txt'
+      - 'requirements.txt'
+
+# only cancel in-progress runs of the same workflow
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+
+  build:
+    name: vllm-ascend lint image build
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          persist-credentials: false
+
+      - name: Print
+        run: |
+          lscpu
+      - name: Docker meta
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: |
+            quay.io/ascend-ci/vllm-ascend
+          tags: lint
+          flavor:
+            latest=false
+
+      - name: Build - Set up QEMU
+        uses: docker/setup-qemu-action@v3
+
+      - name: Build - Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Publish - Login to Quay Container Registry
+        if: ${{ github.repository_owner == 'vllm-project' }}
+        uses: docker/login-action@v3
+        with:
+          registry: quay.io
+          username: ${{ vars.QUAY_CI_USERNAME }}
+          password: ${{ secrets.QUAY_CI_PASSWORD }}
+
+      - name: Build and push
+        uses: docker/build-push-action@v6
+        with:
+          # For now, we only build amd64 lint image
+          platforms: 'linux/amd64'
+          context: .
+ file: .github/workflows/dockerfiles/Dockerfile.lint + push: true + labels: ${{ steps.meta.outputs.labels }} + tags: ${{ steps.meta.outputs.tags }} + provenance: false diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index fd2dc626..e18a67be 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -38,40 +38,6 @@ repos: - id: actionlint - repo: local hooks: - # For local development, you can run mypy using tools/mypy.sh script if needed. - # - id: mypy-local - # name: Run mypy for local Python installation - # entry: tools/mypy.sh 0 "local" - # language: system - # types: [python] - # stages: [pre-commit] # Don't run in CI - - id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward - name: Run mypy for Python 3.10 - entry: tools/mypy.sh 1 "3.10" - # Use system python because vllm installation is required - language: system - types: [python] - stages: [manual] # Only run in CI - - id: mypy-3.11 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward - name: Run mypy for Python 3.11 - entry: tools/mypy.sh 1 "3.11" - # Use system python because vllm installation is required - language: system - types: [python] - stages: [manual] # Only run in CI - - id: mypy-3.12 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward - name: Run mypy for Python 3.12 - entry: tools/mypy.sh 1 "3.12" - # Use system python because vllm installation is required - language: system - types: [python] - stages: [manual] # Only run in CI - # FIXME: enable shellcheck - # - id: shellcheck - # name: Lint shell scripts - # entry: tools/shellcheck.sh - # language: script - # types: [shell] - id: png-lint name: Lint PNG exports from excalidraw entry: tools/png-lint.sh diff --git a/tests/e2e/conftest.py b/tests/e2e/conftest.py index 1d7fd9ec..400d9778 100644 --- a/tests/e2e/conftest.py +++ b/tests/e2e/conftest.py @@ -453,9 +453,9 @@ class VllmRunner: if images is not 
None and (image := images[i]) is not None: multi_modal_data["image"] = image if videos is not None and (video := videos[i]) is not None: - multi_modal_data["video"] = video + multi_modal_data["video"] = video # type: ignore if audios is not None and (audio := audios[i]) is not None: - multi_modal_data["audio"] = audio + multi_modal_data["audio"] = audio # type: ignore text_prompt_kwargs: dict[str, Any] = { "multi_modal_data": multi_modal_data or None diff --git a/tools/mypy.sh b/tools/mypy.sh index caac0a1d..b8b14b0d 100755 --- a/tools/mypy.sh +++ b/tools/mypy.sh @@ -30,9 +30,13 @@ if [ $PYTHON_VERSION == "local" ]; then PYTHON_VERSION=$(python -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")') fi +# Define colors +GREEN='\033[0;32m' +NC='\033[0m' # No Color + run_mypy() { - echo "Running mypy on $1" - mypy --follow-imports skip --python-version "${PYTHON_VERSION}" "$@" + echo -e "${GREEN}Running mypy for $1 on python version: ${PYTHON_VERSION}${NC}" + mypy --follow-imports skip --check-untyped-defs --python-version "${PYTHON_VERSION}" "$@" } run_mypy vllm_ascend diff --git a/vllm_ascend/_310p/attention/attention_v1.py b/vllm_ascend/_310p/attention/attention_v1.py index 347c4bc1..9e3ccf2f 100644 --- a/vllm_ascend/_310p/attention/attention_v1.py +++ b/vllm_ascend/_310p/attention/attention_v1.py @@ -15,6 +15,7 @@ # This file is a part of the vllm-ascend project. 
# +from typing import Any import torch import torch_npu @@ -23,7 +24,7 @@ from vllm_ascend._310p.attention.attention_mask import AttentionMaskBuilder, bui from vllm_ascend._310p.attention.metadata_builder import AscendAttentionMetadataBuilder310P from vllm_ascend.attention.attention_v1 import AscendAttentionBackend as _BaseBackend from vllm_ascend.attention.attention_v1 import AscendAttentionBackendImpl as _BaseImpl -from vllm_ascend.attention.attention_v1 import AscendAttentionMetadataBuilder, AscendAttentionState +from vllm_ascend.attention.attention_v1 import AscendAttentionMetadataBuilder, AscendAttentionState, AscendMetadata from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, aligned_16, nd_to_nz_2d @@ -47,9 +48,17 @@ class AscendAttentionBackend310(_BaseBackend): class AscendAttentionBackendImpl310(_BaseImpl): - def forward_paged_attention(self, query, attn_metadata, output): + def forward_paged_attention( + self, + query: Any, + attn_metadata: AscendMetadata, + output: Any | None = None, + ) -> Any: if attn_metadata.seq_lens.device != query.device: - attn_metadata.seq_lens = attn_metadata.seq_lens.to(device=query.device, non_blocking=True) + attn_metadata.seq_lens = attn_metadata.seq_lens.to( + device=query.device, + non_blocking=True, + ) return super().forward_paged_attention(query, attn_metadata, output) def _forward_prefill_310p_fallback(self, query, key, value, attn_metadata, output): diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py index 933fde28..6c45b6dc 100644 --- a/vllm_ascend/attention/attention_v1.py +++ b/vllm_ascend/attention/attention_v1.py @@ -46,9 +46,7 @@ from vllm_ascend.device.device_op import DeviceOperator from vllm_ascend.ops.flashcomm2_oshard_manager import flashcomm2_oshard_manager from vllm_ascend.utils import vllm_version_is, weak_ref_tensors -# isort: off if vllm_version_is("0.13.0"): - from vllm.v1.attention.backends.utils import AttentionCGSupport, AttentionMetadataBuilder from 
vllm.attention.backends.abstract import ( # type: ignore AttentionBackend, AttentionImpl, @@ -59,20 +57,21 @@ if vllm_version_is("0.13.0"): AttentionBackendEnum, register_backend, ) + from vllm.v1.attention.backends.utils import AttentionCGSupport, AttentionMetadataBuilder else: from vllm.v1.attention.backend import ( # type: ignore AttentionBackend, AttentionCGSupport, AttentionImpl, AttentionLayer, - AttentionType, AttentionMetadataBuilder, + AttentionType, ) from vllm.v1.attention.backends.registry import ( # type: ignore AttentionBackendEnum, register_backend, ) -# isort: on + # default max value of sliding window size SWA_INT_MAX = 2147483647 diff --git a/vllm_ascend/distributed/kv_transfer/kv_p2p/mooncake_connector.py b/vllm_ascend/distributed/kv_transfer/kv_p2p/mooncake_connector.py index a54ef6cb..ee07e5f7 100644 --- a/vllm_ascend/distributed/kv_transfer/kv_p2p/mooncake_connector.py +++ b/vllm_ascend/distributed/kv_transfer/kv_p2p/mooncake_connector.py @@ -13,7 +13,7 @@ from collections import defaultdict, deque from collections.abc import Iterator from concurrent.futures import ThreadPoolExecutor from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, List, Optional, OrderedDict, Tuple +from typing import TYPE_CHECKING, Any, List, Optional, OrderedDict, Tuple, TypedDict import msgspec import numpy as np @@ -60,6 +60,11 @@ GET_META_MSG = b"get_meta_msg" DONE_RECVING_MSG = b"done_recving_msg" +class RemotePortInfo(TypedDict): + num: int + host: str + + class MooncakeAgentMetadata(msgspec.Struct, omit_defaults=True, dict=True): engine_id: str te_rpc_port: int @@ -384,7 +389,7 @@ class KVCacheRecvingThread(threading.Thread): remote_handshake_port: int, offset: int, tp_num_need_pulls: int, - remote_port_send_num: dict[int, dict[str, int | str]] = {}, + remote_port_send_num: dict[int, RemotePortInfo] = {}, all_task_done: bool = False): """Add a new request to the queue for processing.""" logger.debug(f"Adding request {request_id} to the 
queue.") @@ -458,8 +463,9 @@ class KVCacheRecvingThread(threading.Thread): self._send_done_signal_to_free_remote_port(remote_request_id, remote_host, remote_port_send_num) - def _send_done_signal_to_free_remote_port(self, request_id, remote_host, - remote_port_send_num): + def _send_done_signal_to_free_remote_port( + self, request_id: str, remote_host: str, + remote_port_send_num: dict[int, RemotePortInfo]): if self.side_channel_port != self.local_handshake_port \ or not remote_port_send_num: return @@ -708,9 +714,10 @@ class KVCacheRecvingThread(threading.Thread): logger.debug("Returned socket to pool for %s:%d", remote_host, remote_handshake_port) - def _send_done_recv_signal(self, request_id: str, remote_host: str, - remote_handshake_port: int, - remote_port_send_num: dict[int, dict[str, int | str]]): + def _send_done_recv_signal( + self, request_id: str, remote_host: str, + remote_handshake_port: int, + remote_port_send_num: dict[int, RemotePortInfo]): logger.debug("Sending done recving signal for request %s to %s:%d", request_id, remote_host, remote_handshake_port) sock: Optional[zmq.Socket] = None # type: ignore @@ -1177,7 +1184,7 @@ class MooncakeConnectorWorker: self.tp_num_need_pulls = num_d_block_heads // num_p_block_heads self.local_remote_block_port_mapping: dict[ str, Optional[List[List[int]]]] = {} - self.remote_port_send_num: dict[str, dict[int, dict[str, int | str]]] = {} + self.remote_port_send_num: dict[str, dict[int, RemotePortInfo]] = {} def _get_prefill_decode_size(self, vllm_config: VllmConfig): # get prefill tp and dp size from extra config @@ -1463,16 +1470,20 @@ class MooncakeConnectorWorker: return local_remote_block_port_mappings - def get_remote_port_send_num(local_remote_block_port_mappings): - remote_port_send_num: dict[int, dict[str, int | str]] = {} + def get_remote_port_send_num( + local_remote_block_port_mappings: dict[int, list[list[int]]] + ) -> dict[int, RemotePortInfo]: + remote_port_send_num: dict[int, RemotePortInfo] = {} for 
port in range(self._prefill_tp_size * meta.remote_pcp_size): - remote_host = meta.remote_multi_nodes_meta_mapping[str(port)]['host'] - remote_port_send_num[meta.remote_port + port] = {} - remote_port_send_num[meta.remote_port + port]['num'] = 0 - remote_port_send_num[meta.remote_port + port]['host'] = remote_host - for local_port in local_remote_block_port_mappings.keys(): - remote_port_head_list = local_remote_block_port_mappings[ - local_port] + remote_host = str(meta.remote_multi_nodes_meta_mapping[str( + port)]['host']) + remote_port_send_num[meta.remote_port + port] = { + 'num': 0, + 'host': remote_host + } + + for remote_port_head_list in local_remote_block_port_mappings.values( + ): for remote_port_list in remote_port_head_list: for remote_port in remote_port_list: remote_port_send_num[remote_port]['num'] += 1 diff --git a/vllm_ascend/xlite/xlite.py b/vllm_ascend/xlite/xlite.py index f3007a6f..aa21a944 100644 --- a/vllm_ascend/xlite/xlite.py +++ b/vllm_ascend/xlite/xlite.py @@ -25,7 +25,7 @@ from vllm.distributed import (get_ep_group, from vllm.forward_context import get_forward_context from vllm.logger import logger from vllm.sequence import IntermediateTensors -from xlite._C import (AttnMHA, Model, ModelAttnMeta, ModelConfig, Runtime, +from xlite._C import (AttnMHA, Model, ModelAttnMeta, ModelConfig, Runtime, # type: ignore[attr-defined] ScoringFuncSoftmax) import vllm_ascend.envs as envs_ascend @@ -214,10 +214,10 @@ class QwenMoeXliteModel(LlamaXliteModel): config.def_dp_size = vllm_config.parallel_config.data_parallel_size config.moe_ep_size = ep_group.world_size if vllm_config.parallel_config.enable_expert_parallel else 1 config.moe_tp_size = 1 if vllm_config.parallel_config.enable_expert_parallel else ep_group.world_size - config.experts_weight_transpose = True + config.experts_weight_transpose = True # type: ignore config.moe_intermediate_size = hf_config.moe_intermediate_size - config.norm_topk_prob = hf_config.norm_topk_prob - config.scoring_func = 
ScoringFuncSoftmax + config.norm_topk_prob = hf_config.norm_topk_prob # type: ignore + config.scoring_func = ScoringFuncSoftmax # type: ignore return config def _build_model(self, runnable: nn.Module, vllm_config: VllmConfig,