[Lint]Style: Convert vllm-ascend/ to ruff format (new Batch #8) (#6604)

### What this PR does / why we need it?
**Scope of Changes** (a local reproduction sketch follows the table):
| File Path |
| :--- |
| vllm_ascend/ops/\_\_init\_\_.py |
| vllm_ascend/ops/activation.py |
| vllm_ascend/ops/flashcomm2_oshard_manager.py |
| vllm_ascend/ops/layernorm.py |
| vllm_ascend/ops/mla.py |
| vllm_ascend/ops/mm_encoder_attention.py |
| vllm_ascend/ops/register_custom_ops.py |
| vllm_ascend/ops/vocab_parallel_embedding.py |
| vllm_ascend/ops/weight_prefetch.py |
| vllm_ascend/spec_decode/\_\_init\_\_.py |
| vllm_ascend/spec_decode/eagle_proposer.py |
| vllm_ascend/spec_decode/interface.py |
| vllm_ascend/spec_decode/mtp_proposer.py |
| vllm_ascend/spec_decode/ngram_proposer.py |
| vllm_ascend/spec_decode/suffix_proposer.py |

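The reformatting is driven by ruff's formatter. The exact command and configuration used for this batch are not recorded in the PR description, so the snippet below is only a sketch of how the listed paths could be formatted and verified locally (paths copied from the table above; flags are an assumption):

```python
# Hypothetical reproduction of this batch's formatting pass; the real
# invocation/config used by the PR author is not shown in this PR.
import subprocess

BATCH_PATHS = [
    "vllm_ascend/ops/",
    "vllm_ascend/spec_decode/",
]

# Rewrite the files in place with ruff's formatter.
subprocess.run(["ruff", "format", *BATCH_PATHS], check=True)
# Then confirm nothing would be reformatted (non-zero exit otherwise).
subprocess.run(["ruff", "format", "--check", *BATCH_PATHS], check=True)
```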
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.15.0
- vLLM main: d7e17aaacd

Signed-off-by: MrZ20 <2609716663@qq.com>
SILONG ZENG committed 2026-02-07 09:16:07 +08:00 (via GitHub)
parent c63b7a1188, commit 06aa6036f6
17 changed files with 947 additions and 1148 deletions


@@ -19,18 +19,15 @@ import einops
import torch
import torch.nn.functional as F
import torch_npu
from vllm.config import MultiModalConfig
from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention  # type: ignore

import vllm_ascend.envs as envs_ascend

MIN_PAD_SIZE = 64  # min_size to pad weight
MAX_PAD_SIZE = 128  # max_size to pad weight


class AscendMMEncoderAttention(MMEncoderAttention):
    def __init__(
        self,
        num_heads: int,
@@ -82,13 +79,12 @@ class AscendMMEncoderAttention(MMEncoderAttention):
        return query, key, value

    def forward_oot(
-            self,
-            query: torch.Tensor,
-            key: torch.Tensor,
-            value: torch.Tensor,
-            cu_seqlens: torch.Tensor | None = None,
-            max_seqlen: torch.Tensor
-            | None = None,  # Only used for Flash Attention
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        cu_seqlens: torch.Tensor | None = None,
+        max_seqlen: torch.Tensor | None = None,  # Only used for Flash Attention
    ):
        bsz, q_len = query.size()[:2]
        kv_len = key.size(1)
@@ -97,9 +93,7 @@ class AscendMMEncoderAttention(MMEncoderAttention):
        # q, k, v: [b, s, head, head_dim] -> [b * s, head, head_dim]
        q, k, v = self.reshape_qkv_to_3d(query, key, value, bsz, q_len, kv_len)

-        enable_pad = (envs_ascend.USE_OPTIMIZED_MODEL
-                      and self.head_size > MIN_PAD_SIZE
-                      and self.head_size < MAX_PAD_SIZE)
+        enable_pad = envs_ascend.USE_OPTIMIZED_MODEL and self.head_size > MIN_PAD_SIZE and self.head_size < MAX_PAD_SIZE

        if enable_pad:
            origin_shape = q.shape[-1]
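For context on the reflowed `enable_pad` line above: padding applies only when the head size lies strictly between `MIN_PAD_SIZE` (64) and `MAX_PAD_SIZE` (128), and the padded columns are sliced off again via `context_layer[..., :origin_shape]` in the last hunk below. The actual padding helper is elided from this diff, so the snippet below is a hypothetical illustration of that pad-then-slice pattern, not the module's real code:

```python
import torch
import torch.nn.functional as F

MIN_PAD_SIZE = 64   # same constants as in the diff above
MAX_PAD_SIZE = 128


def pad_head_dim(q: torch.Tensor) -> tuple[torch.Tensor, int]:
    """Hypothetical sketch: pad the trailing head_dim axis up to MAX_PAD_SIZE."""
    origin_shape = q.shape[-1]
    if MIN_PAD_SIZE < origin_shape < MAX_PAD_SIZE:
        # F.pad with a 2-element tuple pads only the last dimension.
        q = F.pad(q, (0, MAX_PAD_SIZE - origin_shape))
    return q, origin_shape


q = torch.randn(8, 16, 80)  # [b * s, head, head_dim], sizes chosen for illustration
q_padded, origin_shape = pad_head_dim(q)
assert q_padded.shape[-1] == MAX_PAD_SIZE
out = q_padded[..., :origin_shape]  # after attention, drop the padded columns again
```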
@@ -114,10 +108,7 @@ class AscendMMEncoderAttention(MMEncoderAttention):
        context_layer = torch.empty_like(q)

        if cu_seqlens is None:
-            cu_seqlens = torch.arange(0, (bsz + 1) * q_len,
-                                      step=q_len,
-                                      dtype=torch.int32,
-                                      device=query.device)
+            cu_seqlens = torch.arange(0, (bsz + 1) * q_len, step=q_len, dtype=torch.int32, device=query.device)

        cu_seqlens = torch.diff(cu_seqlens).to("cpu")
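The joined `torch.arange` call above builds uniform cumulative sequence offsets, and the following `torch.diff` converts them into one length per sequence for the NPU attention kernel. A quick standalone check of that arithmetic (sizes picked purely for illustration):

```python
import torch

bsz, q_len = 2, 3  # illustrative sizes, not taken from the PR
cu_seqlens = torch.arange(0, (bsz + 1) * q_len, step=q_len, dtype=torch.int32)
print(cu_seqlens)              # tensor([0, 3, 6], dtype=torch.int32)
print(torch.diff(cu_seqlens))  # tensor([3, 3], dtype=torch.int32): one length per sequence
```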
@@ -137,11 +128,7 @@ class AscendMMEncoderAttention(MMEncoderAttention):
            context_layer = context_layer[..., :origin_shape]

        if is_reshaped:
-            context_layer = einops.rearrange(context_layer,
-                                             "(b s) h d -> b s h d",
-                                             b=bsz).contiguous()
+            context_layer = einops.rearrange(context_layer, "(b s) h d -> b s h d", b=bsz).contiguous()
        else:
-            context_layer = einops.rearrange(context_layer,
-                                             "(b s) h d -> b s (h d)",
-                                             b=bsz).contiguous()
+            context_layer = einops.rearrange(context_layer, "(b s) h d -> b s (h d)", b=bsz).contiguous()

        return context_layer
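The two `einops.rearrange` calls only differ in whether the head axis stays separate (`b s h d`) or is folded back into the hidden size (`b s (h d)`). A minimal shape check, with sizes picked purely for illustration:

```python
import einops
import torch

bsz, seq, heads, head_dim = 2, 4, 8, 64  # illustrative sizes only
context_layer = torch.randn(bsz * seq, heads, head_dim)  # [(b s), h, d]

kept = einops.rearrange(context_layer, "(b s) h d -> b s h d", b=bsz).contiguous()
merged = einops.rearrange(context_layer, "(b s) h d -> b s (h d)", b=bsz).contiguous()

print(kept.shape)    # torch.Size([2, 4, 8, 64])
print(merged.shape)  # torch.Size([2, 4, 512])
```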