[CI] optimize lint term (#5986)
### What this PR does / why we need it?
This patch aims to optimize the lint check time. The main idea is to
reduce unnecessary installation time.
1. The installation of vllm is not required; simply appending the path of the
vllm src to the `PYTHONPATH` is sufficient
2. The installation of `requirements-dev.txt` is not required; we have a
pre-built image `quay.io/ascend-ci/vllm-ascend:lint` with all the
requirements installed in advance.
**NOTE**: the conditions for triggering image builds are: 1) Daily
scheduled build; 2) Build when requirements are modified; 3) Manual
build. This ensures that the dependencies in our image are up-to-date to
the greatest extent possible.
3. The `mypy` was separated from the `pre-commit` hook for performance
reasons; we found that integrating `mypy` into the `pre-commit` hook
resulted in poor performance.
4. Reduce the CPU core consumption from 16 -> 8
### Does this PR introduce _any_ user-facing change?
The end-to-end lint time was optimized from 20min/per PR to 8min/per PR
### How was this patch tested?
- vLLM version: v0.13.0
- vLLM main:
2c24bc6996
---------
Signed-off-by: wangli <wangli858794774@gmail.com>
This commit is contained in:
@@ -15,6 +15,7 @@
|
||||
# This file is a part of the vllm-ascend project.
|
||||
#
|
||||
|
||||
from typing import Any
|
||||
|
||||
import torch
|
||||
import torch_npu
|
||||
@@ -23,7 +24,7 @@ from vllm_ascend._310p.attention.attention_mask import AttentionMaskBuilder, bui
|
||||
from vllm_ascend._310p.attention.metadata_builder import AscendAttentionMetadataBuilder310P
|
||||
from vllm_ascend.attention.attention_v1 import AscendAttentionBackend as _BaseBackend
|
||||
from vllm_ascend.attention.attention_v1 import AscendAttentionBackendImpl as _BaseImpl
|
||||
from vllm_ascend.attention.attention_v1 import AscendAttentionMetadataBuilder, AscendAttentionState
|
||||
from vllm_ascend.attention.attention_v1 import AscendAttentionMetadataBuilder, AscendAttentionState, AscendMetadata
|
||||
from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, aligned_16, nd_to_nz_2d
|
||||
|
||||
|
||||
@@ -47,9 +48,17 @@ class AscendAttentionBackend310(_BaseBackend):
|
||||
|
||||
|
||||
class AscendAttentionBackendImpl310(_BaseImpl):
|
||||
def forward_paged_attention(self, query, attn_metadata, output):
|
||||
def forward_paged_attention(
|
||||
self,
|
||||
query: Any,
|
||||
attn_metadata: AscendMetadata,
|
||||
output: Any | None = None,
|
||||
) -> Any:
|
||||
if attn_metadata.seq_lens.device != query.device:
|
||||
attn_metadata.seq_lens = attn_metadata.seq_lens.to(device=query.device, non_blocking=True)
|
||||
attn_metadata.seq_lens = attn_metadata.seq_lens.to(
|
||||
device=query.device,
|
||||
non_blocking=True,
|
||||
)
|
||||
return super().forward_paged_attention(query, attn_metadata, output)
|
||||
|
||||
def _forward_prefill_310p_fallback(self, query, key, value, attn_metadata, output):
|
||||
|
||||
@@ -46,9 +46,7 @@ from vllm_ascend.device.device_op import DeviceOperator
|
||||
from vllm_ascend.ops.flashcomm2_oshard_manager import flashcomm2_oshard_manager
|
||||
from vllm_ascend.utils import vllm_version_is, weak_ref_tensors
|
||||
|
||||
# isort: off
|
||||
if vllm_version_is("0.13.0"):
|
||||
from vllm.v1.attention.backends.utils import AttentionCGSupport, AttentionMetadataBuilder
|
||||
from vllm.attention.backends.abstract import ( # type: ignore
|
||||
AttentionBackend,
|
||||
AttentionImpl,
|
||||
@@ -59,20 +57,21 @@ if vllm_version_is("0.13.0"):
|
||||
AttentionBackendEnum,
|
||||
register_backend,
|
||||
)
|
||||
from vllm.v1.attention.backends.utils import AttentionCGSupport, AttentionMetadataBuilder
|
||||
else:
|
||||
from vllm.v1.attention.backend import ( # type: ignore
|
||||
AttentionBackend,
|
||||
AttentionCGSupport,
|
||||
AttentionImpl,
|
||||
AttentionLayer,
|
||||
AttentionType,
|
||||
AttentionMetadataBuilder,
|
||||
AttentionType,
|
||||
)
|
||||
from vllm.v1.attention.backends.registry import ( # type: ignore
|
||||
AttentionBackendEnum,
|
||||
register_backend,
|
||||
)
|
||||
# isort: on
|
||||
|
||||
|
||||
# default max value of sliding window size
|
||||
SWA_INT_MAX = 2147483647
|
||||
|
||||
@@ -13,7 +13,7 @@ from collections import defaultdict, deque
|
||||
from collections.abc import Iterator
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from dataclasses import dataclass
|
||||
from typing import TYPE_CHECKING, Any, List, Optional, OrderedDict, Tuple
|
||||
from typing import TYPE_CHECKING, Any, List, Optional, OrderedDict, Tuple, TypedDict
|
||||
|
||||
import msgspec
|
||||
import numpy as np
|
||||
@@ -60,6 +60,11 @@ GET_META_MSG = b"get_meta_msg"
|
||||
DONE_RECVING_MSG = b"done_recving_msg"
|
||||
|
||||
|
||||
class RemotePortInfo(TypedDict):
|
||||
num: int
|
||||
host: str
|
||||
|
||||
|
||||
class MooncakeAgentMetadata(msgspec.Struct, omit_defaults=True, dict=True):
|
||||
engine_id: str
|
||||
te_rpc_port: int
|
||||
@@ -384,7 +389,7 @@ class KVCacheRecvingThread(threading.Thread):
|
||||
remote_handshake_port: int,
|
||||
offset: int,
|
||||
tp_num_need_pulls: int,
|
||||
remote_port_send_num: dict[int, dict[str, int | str]] = {},
|
||||
remote_port_send_num: dict[int, RemotePortInfo] = {},
|
||||
all_task_done: bool = False):
|
||||
"""Add a new request to the queue for processing."""
|
||||
logger.debug(f"Adding request {request_id} to the queue.")
|
||||
@@ -458,8 +463,9 @@ class KVCacheRecvingThread(threading.Thread):
|
||||
self._send_done_signal_to_free_remote_port(remote_request_id, remote_host,
|
||||
remote_port_send_num)
|
||||
|
||||
def _send_done_signal_to_free_remote_port(self, request_id, remote_host,
|
||||
remote_port_send_num):
|
||||
def _send_done_signal_to_free_remote_port(
|
||||
self, request_id: str, remote_host: str,
|
||||
remote_port_send_num: dict[int, RemotePortInfo]):
|
||||
if self.side_channel_port != self.local_handshake_port \
|
||||
or not remote_port_send_num:
|
||||
return
|
||||
@@ -708,9 +714,10 @@ class KVCacheRecvingThread(threading.Thread):
|
||||
logger.debug("Returned socket to pool for %s:%d", remote_host,
|
||||
remote_handshake_port)
|
||||
|
||||
def _send_done_recv_signal(self, request_id: str, remote_host: str,
|
||||
remote_handshake_port: int,
|
||||
remote_port_send_num: dict[int, dict[str, int | str]]):
|
||||
def _send_done_recv_signal(
|
||||
self, request_id: str, remote_host: str,
|
||||
remote_handshake_port: int,
|
||||
remote_port_send_num: dict[int, RemotePortInfo]):
|
||||
logger.debug("Sending done recving signal for request %s to %s:%d",
|
||||
request_id, remote_host, remote_handshake_port)
|
||||
sock: Optional[zmq.Socket] = None # type: ignore
|
||||
@@ -1177,7 +1184,7 @@ class MooncakeConnectorWorker:
|
||||
self.tp_num_need_pulls = num_d_block_heads // num_p_block_heads
|
||||
self.local_remote_block_port_mapping: dict[
|
||||
str, Optional[List[List[int]]]] = {}
|
||||
self.remote_port_send_num: dict[str, dict[int, dict[str, int | str]]] = {}
|
||||
self.remote_port_send_num: dict[str, dict[int, RemotePortInfo]] = {}
|
||||
|
||||
def _get_prefill_decode_size(self, vllm_config: VllmConfig):
|
||||
# get prefill tp and dp size from extra config
|
||||
@@ -1463,16 +1470,20 @@ class MooncakeConnectorWorker:
|
||||
|
||||
return local_remote_block_port_mappings
|
||||
|
||||
def get_remote_port_send_num(local_remote_block_port_mappings):
|
||||
remote_port_send_num: dict[int, dict[str, int | str]] = {}
|
||||
def get_remote_port_send_num(
|
||||
local_remote_block_port_mappings: dict[int, list[list[int]]]
|
||||
) -> dict[int, RemotePortInfo]:
|
||||
remote_port_send_num: dict[int, RemotePortInfo] = {}
|
||||
for port in range(self._prefill_tp_size * meta.remote_pcp_size):
|
||||
remote_host = meta.remote_multi_nodes_meta_mapping[str(port)]['host']
|
||||
remote_port_send_num[meta.remote_port + port] = {}
|
||||
remote_port_send_num[meta.remote_port + port]['num'] = 0
|
||||
remote_port_send_num[meta.remote_port + port]['host'] = remote_host
|
||||
for local_port in local_remote_block_port_mappings.keys():
|
||||
remote_port_head_list = local_remote_block_port_mappings[
|
||||
local_port]
|
||||
remote_host = str(meta.remote_multi_nodes_meta_mapping[str(
|
||||
port)]['host'])
|
||||
remote_port_send_num[meta.remote_port + port] = {
|
||||
'num': 0,
|
||||
'host': remote_host
|
||||
}
|
||||
|
||||
for remote_port_head_list in local_remote_block_port_mappings.values(
|
||||
):
|
||||
for remote_port_list in remote_port_head_list:
|
||||
for remote_port in remote_port_list:
|
||||
remote_port_send_num[remote_port]['num'] += 1
|
||||
|
||||
@@ -25,7 +25,7 @@ from vllm.distributed import (get_ep_group,
|
||||
from vllm.forward_context import get_forward_context
|
||||
from vllm.logger import logger
|
||||
from vllm.sequence import IntermediateTensors
|
||||
from xlite._C import (AttnMHA, Model, ModelAttnMeta, ModelConfig, Runtime,
|
||||
from xlite._C import (AttnMHA, Model, ModelAttnMeta, ModelConfig, Runtime, # type: ignore[attr-defined]
|
||||
ScoringFuncSoftmax)
|
||||
|
||||
import vllm_ascend.envs as envs_ascend
|
||||
@@ -214,10 +214,10 @@ class QwenMoeXliteModel(LlamaXliteModel):
|
||||
config.def_dp_size = vllm_config.parallel_config.data_parallel_size
|
||||
config.moe_ep_size = ep_group.world_size if vllm_config.parallel_config.enable_expert_parallel else 1
|
||||
config.moe_tp_size = 1 if vllm_config.parallel_config.enable_expert_parallel else ep_group.world_size
|
||||
config.experts_weight_transpose = True
|
||||
config.experts_weight_transpose = True # type: ignore
|
||||
config.moe_intermediate_size = hf_config.moe_intermediate_size
|
||||
config.norm_topk_prob = hf_config.norm_topk_prob
|
||||
config.scoring_func = ScoringFuncSoftmax
|
||||
config.norm_topk_prob = hf_config.norm_topk_prob # type: ignore
|
||||
config.scoring_func = ScoringFuncSoftmax # type: ignore
|
||||
return config
|
||||
|
||||
def _build_model(self, runnable: nn.Module, vllm_config: VllmConfig,
|
||||
|
||||
Reference in New Issue
Block a user