[Lint]Style: Convert vllm-ascend/ to ruff format(Batch #10) (#6173)

### What this PR does / why we need it? **Scope of Changes**: | File Path | | :--- | |`vllm_ascend/ops/layer_shard_linear.py`| |`vllm_ascend/ops/linear.py`| |`vllm_ascend/ops/linear_op.py`| |`vllm_ascend/worker/worker.py`| | ` vllm_ascend/patch/worker/patch_bert.py` | | ` vllm_ascend/patch/worker/patch_deepseek.py` | | ` vllm_ascend/patch/worker/patch_distributed.py` | | ` vllm_ascend/patch/worker/patch_module.py` | | ` vllm_ascend/patch/worker/patch_multimodal_merge.py` | | ` vllm_ascend/patch/worker/patch_qwen3_next.py` | | ` vllm_ascend/patch/worker/patch_qwen3_next_mtp.py` | | ` vllm_ascend/patch/worker/patch_rejection_sampler.py` | | ` vllm_ascend/patch/worker/patch_rope.py` | | ` vllm_ascend/patch/worker/patch_triton.py` | | ` vllm_ascend/patch/worker/patch_unquantized_gemm.py` | | ` vllm_ascend/patch/worker/patch_v2_egale.py` | |` vllm_ascend/worker/npu_input_batch.py`| |` vllm_ascend/worker/v2/aclgraph_utils.py`| |` vllm_ascend/worker/v2/attn_utils.py`| |` vllm_ascend/worker/v2/model_runner.py`| |` vllm_ascend/worker/v2/sample/gumbel.py`| |` vllm_ascend/worker/v2/sample/penalties.py`| |` vllm_ascend/worker/v2/sample/sampler.py`| |` vllm_ascend/worker/v2/spec_decode/__init__.py`| |` vllm_ascend/worker/v2/spec_decode/eagle.py`| |` vllm_ascend/worker/v2/states.py`| ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.14.0 - vLLM main: d68209402d Signed-off-by: MrZ20 <2609716663@qq.com> Signed-off-by: SILONG ZENG <2609716663@qq.com> Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com> Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com>
2026-02-06 15:35:06 +08:00
parent 65b7f716e6
commit 19b5d44ea8
33 changed files with 938 additions and 1243 deletions
--- a/vllm_ascend/ops/linear.py
+++ b/vllm_ascend/ops/linear.py
@@ -20,19 +20,23 @@ AscendMergedColumnParallelLinear, AscendMergedColumnParallelLinear,
 AscendRowParallelLinear and AscendColumnParallelLinear.
 """

-from typing import Optional, Union
-
 import torch
 import torch.nn as nn
 from torch.nn.parameter import Parameter
 from vllm.config import get_current_vllm_config
 from vllm.distributed import divide
 from vllm.model_executor.layers.linear import (  # noqa
-    WEIGHT_LOADER_V2_SUPPORTED, ColumnParallelLinear, LinearBase,
-    MergedColumnParallelLinear, QKVParallelLinear, QuantizeMethodBase,
-    ReplicatedLinear, RowParallelLinear, UnquantizedLinearMethod)
-from vllm.model_executor.layers.quantization.base_config import \
-    QuantizationConfig
+    WEIGHT_LOADER_V2_SUPPORTED,
+    ColumnParallelLinear,
+    LinearBase,
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    QuantizeMethodBase,
+    ReplicatedLinear,
+    RowParallelLinear,
+    UnquantizedLinearMethod,
+)
+from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.utils import set_weight_attrs

 from vllm_ascend.ops.linear_op import get_parallel_op, get_replicated_op
@@ -50,14 +54,13 @@ class AscendUnquantizedLinearMethod(UnquantizedLinearMethod):

 # TODO(realliujiaxu): Remove this class after linear of vllm supports custom comm group
 class AscendLinearBase(LinearBase):
-
    def __init__(
        self,
        input_size: int,
        output_size: int,
        skip_bias_add: bool = False,
-        params_dtype: Optional[torch.dtype] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        params_dtype: torch.dtype | None = None,
+        quant_config: QuantizationConfig | None = None,
        prefix: str = "",
        *,
        return_bias: bool = True,
@@ -75,11 +78,9 @@ class AscendLinearBase(LinearBase):
        self.quant_config = quant_config
        self.prefix = prefix
        if quant_config is None:
-            self.quant_method: Optional[
-                QuantizeMethodBase] = AscendUnquantizedLinearMethod()
+            self.quant_method: QuantizeMethodBase | None = AscendUnquantizedLinearMethod()
        else:
-            self.quant_method = quant_config.get_quant_method(self,
-                                                              prefix=prefix)
+            self.quant_method = quant_config.get_quant_method(self, prefix=prefix)
        self.return_bias = return_bias
        self.disable_tp = disable_tp

@@ -100,11 +101,11 @@ class AscendQKVParallelLinear(QKVParallelLinear):
        hidden_size: int,
        head_size: int,
        total_num_heads: int,
-        total_num_kv_heads: Optional[int] = None,
+        total_num_kv_heads: int | None = None,
        bias: bool = True,
        skip_bias_add: bool = False,
-        params_dtype: Optional[torch.dtype] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        params_dtype: torch.dtype | None = None,
+        quant_config: QuantizationConfig | None = None,
        prefix: str = "",
        *,
        return_bias: bool = True,
@@ -112,9 +113,9 @@ class AscendQKVParallelLinear(QKVParallelLinear):
        v_head_size: int | None = None,
    ):
        self.v_head_size = v_head_size if v_head_size is not None else head_size
-        self.custom_op, _, tp_size = get_parallel_op(disable_tp, prefix, self,
-                                                     "column")
-        # TODO(realliujiaxu): Replace the initialization code below with super().__init__ after linear of vllm supports custom comm group
+        self.custom_op, _, tp_size = get_parallel_op(disable_tp, prefix, self, "column")
+        # TODO(realliujiaxu): Replace the initialization code below with super().__init__ after
+        # linear of vllm supports custom comm group
        self.hidden_size = hidden_size
        self.head_size = head_size
        self.total_num_heads = total_num_heads
@@ -125,35 +126,35 @@ class AscendQKVParallelLinear(QKVParallelLinear):
        self.num_heads = divide(self.total_num_heads, tp_size)
        if tp_size >= self.total_num_kv_heads:
            self.num_kv_heads = 1
-            self.num_kv_head_replicas = divide(tp_size,
-                                               self.total_num_kv_heads)
+            self.num_kv_head_replicas = divide(tp_size, self.total_num_kv_heads)
        else:
            self.num_kv_heads = divide(self.total_num_kv_heads, tp_size)
            self.num_kv_head_replicas = 1
        input_size = self.hidden_size
-        output_size = (self.num_heads +
-                       2 * self.num_kv_heads) * tp_size * self.head_size
+        output_size = (self.num_heads + 2 * self.num_kv_heads) * tp_size * self.head_size
        self.output_sizes = [
            self.num_heads * self.head_size * tp_size,  # q_proj
            self.num_kv_heads * self.head_size * tp_size,  # k_proj
            self.num_kv_heads * self.head_size * tp_size,  # v_proj
        ]
-        AscendColumnParallelLinear.__init__(self,
-                                            input_size=input_size,
-                                            output_size=output_size,
-                                            bias=bias,
-                                            gather_output=False,
-                                            skip_bias_add=skip_bias_add,
-                                            params_dtype=params_dtype,
-                                            quant_config=quant_config,
-                                            prefix=prefix,
-                                            return_bias=return_bias,
-                                            disable_tp=disable_tp)
+        AscendColumnParallelLinear.__init__(
+            self,
+            input_size=input_size,
+            output_size=output_size,
+            bias=bias,
+            gather_output=False,
+            skip_bias_add=skip_bias_add,
+            params_dtype=params_dtype,
+            quant_config=quant_config,
+            prefix=prefix,
+            return_bias=return_bias,
+            disable_tp=disable_tp,
+        )

    def forward(
        self,
        input_,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[Parameter]]]:
+    ) -> torch.Tensor | tuple[torch.Tensor, Parameter | None]:
        if self.custom_op is not None:
            return self.custom_op.apply(input_)

@@ -178,35 +179,36 @@ class AscendMergedColumnParallelLinear(MergedColumnParallelLinear):
        bias: bool = True,
        gather_output: bool = False,
        skip_bias_add: bool = False,
-        params_dtype: Optional[torch.dtype] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        params_dtype: torch.dtype | None = None,
+        quant_config: QuantizationConfig | None = None,
        prefix: str = "",
        *,
        return_bias: bool = True,
        disable_tp: bool = False,
    ):
-        self.custom_op, self.tp_rank, self.tp_size = get_parallel_op(
-            disable_tp, prefix, self, "column")
-        # TODO(realliujiaxu): Replace the initialization code below with super().__init__ after linear of vllm supports custom comm group
+        self.custom_op, self.tp_rank, self.tp_size = get_parallel_op(disable_tp, prefix, self, "column")
+        # TODO(realliujiaxu): Replace the initialization code below with super().__init__ after
+        # linear of vllm supports custom comm group
        self.output_sizes = output_sizes
-        assert all(output_size % self.tp_size == 0
-                   for output_size in output_sizes)
-        AscendColumnParallelLinear.__init__(self,
-                                            input_size=input_size,
-                                            output_size=sum(output_sizes),
-                                            bias=bias,
-                                            gather_output=gather_output,
-                                            skip_bias_add=skip_bias_add,
-                                            params_dtype=params_dtype,
-                                            quant_config=quant_config,
-                                            prefix=prefix,
-                                            return_bias=return_bias,
-                                            disable_tp=disable_tp)
+        assert all(output_size % self.tp_size == 0 for output_size in output_sizes)
+        AscendColumnParallelLinear.__init__(
+            self,
+            input_size=input_size,
+            output_size=sum(output_sizes),
+            bias=bias,
+            gather_output=gather_output,
+            skip_bias_add=skip_bias_add,
+            params_dtype=params_dtype,
+            quant_config=quant_config,
+            prefix=prefix,
+            return_bias=return_bias,
+            disable_tp=disable_tp,
+        )

    def forward(
        self,
        input_,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[Parameter]]]:
+    ) -> torch.Tensor | tuple[torch.Tensor, Parameter | None]:
        if self.custom_op is not None:
            return self.custom_op.apply(input_)

@@ -229,9 +231,9 @@ class AscendRowParallelLinear(RowParallelLinear):
        bias: bool = True,
        input_is_parallel: bool = True,
        skip_bias_add: bool = False,
-        params_dtype: Optional[torch.dtype] = None,
+        params_dtype: torch.dtype | None = None,
        reduce_results: bool = True,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
        prefix: str = "",
        *,
        return_bias: bool = True,
@@ -247,23 +249,25 @@ class AscendRowParallelLinear(RowParallelLinear):
            self.unique_prefix = unique_prefix
            compilation_config.static_forward_context[unique_prefix] = self

-        self.custom_op, self.tp_rank, self.tp_size = get_parallel_op(
-            disable_tp, prefix, self, "row")
-        # TODO(realliujiaxu): Replace the initialization code below with super().__init__ after linear of vllm supports custom comm group
+        self.custom_op, self.tp_rank, self.tp_size = get_parallel_op(disable_tp, prefix, self, "row")
+        # TODO(realliujiaxu): Replace the initialization code below with super().__init__ after
+        # linear of vllm supports custom comm group
        # Divide the weight matrix along the first dimension.
        self.input_size_per_partition = divide(input_size, self.tp_size)
        self.output_size_per_partition = output_size
        self.output_partition_sizes = [output_size]

-        AscendLinearBase.__init__(self,
-                                  input_size,
-                                  output_size,
-                                  skip_bias_add,
-                                  params_dtype,
-                                  quant_config,
-                                  prefix,
-                                  return_bias=return_bias,
-                                  disable_tp=disable_tp)
+        AscendLinearBase.__init__(
+            self,
+            input_size,
+            output_size,
+            skip_bias_add,
+            params_dtype,
+            quant_config,
+            prefix,
+            return_bias=return_bias,
+            disable_tp=disable_tp,
+        )

        self.input_is_parallel = input_is_parallel
        self.reduce_results = reduce_results
@@ -277,19 +281,23 @@ class AscendRowParallelLinear(RowParallelLinear):
            output_size=self.output_size,
            params_dtype=self.params_dtype,
            weight_loader=(
-                self.weight_loader_v2 if self.quant_method.__class__.__name__
-                in WEIGHT_LOADER_V2_SUPPORTED else self.weight_loader))
+                self.weight_loader_v2
+                if self.quant_method.__class__.__name__ in WEIGHT_LOADER_V2_SUPPORTED
+                else self.weight_loader
+            ),
+        )
        if not reduce_results and (bias and not skip_bias_add):
-            raise ValueError("When not reduce the results, adding bias to the "
-                             "results can lead to incorrect results")
+            raise ValueError("When not reduce the results, adding bias to the results can lead to incorrect results")

        if bias:
-            self.bias = Parameter(
-                torch.empty(self.output_size, dtype=params_dtype))
-            set_weight_attrs(self.bias, {
-                "output_dim": 0,
-                "weight_loader": self.weight_loader,
-            })
+            self.bias = Parameter(torch.empty(self.output_size, dtype=params_dtype))
+            set_weight_attrs(
+                self.bias,
+                {
+                    "output_dim": 0,
+                    "weight_loader": self.weight_loader,
+                },
+            )
        else:
            self.register_parameter("bias", None)

@@ -300,7 +308,7 @@ class AscendRowParallelLinear(RowParallelLinear):
        self,
        input_,
        **kwargs,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[Parameter]]]:
+    ) -> torch.Tensor | tuple[torch.Tensor, Parameter | None]:
        if self.custom_op is not None:
            return self.custom_op.apply(input_)

@@ -321,36 +329,36 @@ class AscendColumnParallelLinear(ColumnParallelLinear):
        bias: bool = True,
        gather_output: bool = False,
        skip_bias_add: bool = False,
-        params_dtype: Optional[torch.dtype] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        output_sizes: Optional[list[int]] = None,
+        params_dtype: torch.dtype | None = None,
+        quant_config: QuantizationConfig | None = None,
+        output_sizes: list[int] | None = None,
        prefix: str = "",
        *,
        return_bias: bool = True,
        disable_tp: bool = False,
    ):
-        self.custom_op, self.tp_rank, self.tp_size = get_parallel_op(
-            disable_tp, prefix, self, "column")
-        # TODO(realliujiaxu): Replace the initialization code below with super().__init__ after linear of vllm supports custom comm group
+        #
+        self.custom_op, self.tp_rank, self.tp_size = get_parallel_op(disable_tp, prefix, self, "column")
+        # TODO(realliujiaxu): Replace the initialization code below with super().__init__ after
+        # linear of vllm supports custom comm group
        self.input_size_per_partition = input_size
        self.output_size_per_partition = divide(output_size, self.tp_size)
        self.output_partition_sizes = [self.output_size_per_partition]
        # If QKV or MergedColumn, use output size of each partition.
        if hasattr(self, "output_sizes"):
-            self.output_partition_sizes = [
-                divide(output_size, self.tp_size)
-                for output_size in self.output_sizes
-            ]
+            self.output_partition_sizes = [divide(output_size, self.tp_size) for output_size in self.output_sizes]

-        AscendLinearBase.__init__(self,
-                                  input_size,
-                                  output_size,
-                                  skip_bias_add,
-                                  params_dtype,
-                                  quant_config,
-                                  prefix,
-                                  return_bias=return_bias,
-                                  disable_tp=disable_tp)
+        AscendLinearBase.__init__(
+            self,
+            input_size,
+            output_size,
+            skip_bias_add,
+            params_dtype,
+            quant_config,
+            prefix,
+            return_bias=return_bias,
+            disable_tp=disable_tp,
+        )

        self.gather_output = gather_output

@@ -366,16 +374,20 @@ class AscendColumnParallelLinear(ColumnParallelLinear):
            output_size=self.output_size,
            params_dtype=self.params_dtype,
            weight_loader=(
-                self.weight_loader_v2 if self.quant_method.__class__.__name__
-                in WEIGHT_LOADER_V2_SUPPORTED else self.weight_loader))
+                self.weight_loader_v2
+                if self.quant_method.__class__.__name__ in WEIGHT_LOADER_V2_SUPPORTED
+                else self.weight_loader
+            ),
+        )
        if bias:
-            self.bias = Parameter(
-                torch.empty(self.output_size_per_partition,
-                            dtype=params_dtype))
-            set_weight_attrs(self.bias, {
-                "output_dim": 0,
-                "weight_loader": self.weight_loader,
-            })
+            self.bias = Parameter(torch.empty(self.output_size_per_partition, dtype=params_dtype))
+            set_weight_attrs(
+                self.bias,
+                {
+                    "output_dim": 0,
+                    "weight_loader": self.weight_loader,
+                },
+            )
        else:
            self.register_parameter("bias", None)

@@ -385,7 +397,7 @@ class AscendColumnParallelLinear(ColumnParallelLinear):
    def forward(
        self,
        input_,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[Parameter]]]:
+    ) -> torch.Tensor | tuple[torch.Tensor, Parameter | None]:
        if self.custom_op is not None:
            return self.custom_op.apply(input_)

@@ -414,8 +426,8 @@ class AscendReplicatedLinear(ReplicatedLinear):
        output_size: int,
        bias: bool = True,
        skip_bias_add: bool = False,
-        params_dtype: Optional[torch.dtype] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        params_dtype: torch.dtype | None = None,
+        quant_config: QuantizationConfig | None = None,
        prefix: str = "",
        *,
        return_bias: bool = True,
@@ -428,32 +440,39 @@ class AscendReplicatedLinear(ReplicatedLinear):
        else:
            self.output_partition_sizes = [output_size]

-        AscendLinearBase.__init__(self,
-                                  input_size,
-                                  output_size,
-                                  skip_bias_add,
-                                  params_dtype,
-                                  quant_config,
-                                  prefix=prefix,
-                                  return_bias=return_bias,
-                                  disable_tp=disable_tp)
+        AscendLinearBase.__init__(
+            self,
+            input_size,
+            output_size,
+            skip_bias_add,
+            params_dtype,
+            quant_config,
+            prefix=prefix,
+            return_bias=return_bias,
+            disable_tp=disable_tp,
+        )

        # All the linear layer supports quant method.
        assert self.quant_method is not None
-        self.quant_method.create_weights(self,
-                                         self.input_size, [self.output_size],
-                                         self.input_size,
-                                         self.output_size,
-                                         self.params_dtype,
-                                         weight_loader=self.weight_loader)
+        self.quant_method.create_weights(
+            self,
+            self.input_size,
+            [self.output_size],
+            self.input_size,
+            self.output_size,
+            self.params_dtype,
+            weight_loader=self.weight_loader,
+        )

        if bias:
-            self.bias = Parameter(
-                torch.empty(self.output_size, dtype=self.params_dtype))
-            set_weight_attrs(self.bias, {
-                "output_dim": 0,
-                "weight_loader": self.weight_loader,
-            })
+            self.bias = Parameter(torch.empty(self.output_size, dtype=self.params_dtype))
+            set_weight_attrs(
+                self.bias,
+                {
+                    "output_dim": 0,
+                    "weight_loader": self.weight_loader,
+                },
+            )
        else:
            self.register_parameter("bias", None)

@@ -463,7 +482,7 @@ class AscendReplicatedLinear(ReplicatedLinear):
    def forward(
        self,
        input_,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[Parameter]]]:
+    ) -> torch.Tensor | tuple[torch.Tensor, Parameter | None]:
        if self.custom_op is not None:
            return self.custom_op.apply(input_)