[Lint] Style: Convert vllm-ascend/ to ruff format (Batch #7) (#6023)

### What this PR does / why we need it?

**Scope of Changes**:

| File Path |
| :--- |
| `vllm_ascend/quantization/compressed_tensors/compressed_tensors.py` |
| `vllm_ascend/quantization/quant_config.py` |
| `vllm_ascend/quantization/utils.py` |
| `vllm_ascend/quantization/w4a16.py` |
| `vllm_ascend/quantization/w4a4_flatquant_dynamic.py` |
| `vllm_ascend/quantization/w4a8_dynamic.py` |
| `vllm_ascend/quantization/w8a16.py` |
| `vllm_ascend/quantization/w8a8.py` |
| `vllm_ascend/quantization/w8a8_dynamic.py` |
| `vllm_ascend/quantization/w8a8_pdmix.py` |
| `vllm_ascend/quantization/w8a8mxfp8.py` |
| `vllm_ascend/sample/rejection_sampler.py` |
| `vllm_ascend/sample/sampler.py` |
| `vllm_ascend/worker/block_table.py` |

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.13.0
- vLLM main: 2c24bc6996

Signed-off-by: MrZ20 <2609716663@qq.com>
2026-02-06 14:56:53 +08:00
parent d0bc16859c
commit 99aedaff63
20 changed files with 997 additions and 1307 deletions
--- a/vllm_ascend/quantization/methods/w4a4_flatquant.py
+++ b/vllm_ascend/quantization/methods/w4a4_flatquant.py
@@ -16,7 +16,7 @@
 #

 import math
-from typing import Any, Dict, Optional, Tuple
+from typing import Any

 import torch
 import torch_npu
@@ -31,8 +31,7 @@ def pack_int4_weights(weight_tensor: torch.Tensor) -> torch.Tensor:
    """Pack int4 weights for NPU."""
    original_device = weight_tensor.device
    weight_tensor_npu = weight_tensor.npu()
-    weight_int4_packed = torch_npu.npu_convert_weight_to_int4pack(
-        weight_tensor_npu.to(torch.int32), inner_k_tiles=1)
+    weight_int4_packed = torch_npu.npu_convert_weight_to_int4pack(weight_tensor_npu.to(torch.int32), inner_k_tiles=1)
    return weight_int4_packed.to(original_device)


@@ -58,22 +57,14 @@ def batched_kronecker_quant(
    left_trans: torch.Tensor,
    right_trans: torch.Tensor,
    clip_ratio: float,
-) -> Tuple[torch.Tensor, torch.Tensor]:
+) -> tuple[torch.Tensor, torch.Tensor]:
    """Batched Kronecker quantization with batch size limit handling."""
    batch_tokens = x.shape[0]
    if batch_tokens <= KRONECKER_QUANT_MAX_BATCH_SIZE:
-        return torch_npu.npu_kronecker_quant(x,
-                                             left_trans,
-                                             right_trans,
-                                             clip_ratio=clip_ratio,
-                                             dst_dtype=torch.int32)
+        return torch_npu.npu_kronecker_quant(x, left_trans, right_trans, clip_ratio=clip_ratio, dst_dtype=torch.int32)
    x_chunks = torch.split(x, KRONECKER_QUANT_MAX_BATCH_SIZE, dim=0)
    processed_chunks = [
-        torch_npu.npu_kronecker_quant(chunk,
-                                      left_trans,
-                                      right_trans,
-                                      clip_ratio=clip_ratio,
-                                      dst_dtype=torch.int32)
+        torch_npu.npu_kronecker_quant(chunk, left_trans, right_trans, clip_ratio=clip_ratio, dst_dtype=torch.int32)
        for chunk in x_chunks
    ]
    quantized_list, scale_list = zip(*processed_chunks)
@@ -85,39 +76,32 @@ def batched_kronecker_quant(
@register_scheme("W4A4_FLATQUANT_DYNAMIC", "linear")
 class AscendW4A4FlatQuantDynamicLinearMethod(AscendLinearScheme):
    """Linear method for Ascend W4A4_FLATQUANT_DYNAMIC.
-    
+
    This class implements W4A4 quantization with FlatQuant approach and dynamic activation quantization.
    - Weight: 4-bit quantization (per-channel) with scale and offset, stored as int8 and packed to int32 during loading
-    - Activation: 4-bit dynamic quantization with FlatQuant transform matrices (left_trans, right_trans) for distribution smoothing
-    - Parameters: clip_ratio for controlling quantization clipping, weight_offset for asymmetric quantization, loaded from external weights
+    - Activation: 4-bit dynamic quantization with FlatQuant transform matrices (left_trans, right_trans) for
+      distribution smoothing
+    - Parameters: clip_ratio for controlling quantization clipping, weight_offset for asymmetric quantization, loaded
+      from external weights
    """
+
    input_size = 0

    def __init__(self):
        self.sym = True

-    def get_weight(self, input_size: int, output_size: int,
-                   params_dtype: torch.dtype) -> Dict[str, Any]:
+    def get_weight(self, input_size: int, output_size: int, params_dtype: torch.dtype) -> dict[str, Any]:
        if input_size % 8 != 0:
-            raise ValueError(
-                f"input_size ({input_size}) must be divisible by 8 for int4 packing"
-            )
+            raise ValueError(f"input_size ({input_size}) must be divisible by 8 for int4 packing")
        AscendW4A4FlatQuantDynamicLinearMethod.input_size = input_size
-        params_dict = {
-            "weight": torch.empty(output_size, input_size, dtype=torch.int8)
-        }
+        params_dict = {"weight": torch.empty(output_size, input_size, dtype=torch.int8)}
        return params_dict

-    def get_pertensor_param(self, params_dtype: torch.dtype) -> Dict[str, Any]:
+    def get_pertensor_param(self, params_dtype: torch.dtype) -> dict[str, Any]:
        params_dict = {}
-        left_trans_dim, right_trans_dim = get_decompose_dim(
-            AscendW4A4FlatQuantDynamicLinearMethod.input_size)
-        params_dict["left_trans"] = torch.empty(left_trans_dim,
-                                                left_trans_dim,
-                                                dtype=params_dtype)
-        params_dict["right_trans"] = torch.empty(right_trans_dim,
-                                                 right_trans_dim,
-                                                 dtype=params_dtype)
+        left_trans_dim, right_trans_dim = get_decompose_dim(AscendW4A4FlatQuantDynamicLinearMethod.input_size)
+        params_dict["left_trans"] = torch.empty(left_trans_dim, left_trans_dim, dtype=params_dtype)
+        params_dict["right_trans"] = torch.empty(right_trans_dim, right_trans_dim, dtype=params_dtype)
        params_dict["clip_ratio"] = torch.empty(1, dtype=torch.float32)
        return params_dict

@@ -125,22 +109,18 @@ class AscendW4A4FlatQuantDynamicLinearMethod(AscendLinearScheme):
        self,
        output_size: int,
        params_dtype: torch.dtype,
-    ) -> Dict[str, Any]:
+    ) -> dict[str, Any]:
        params_dict = {}
-        params_dict["weight_scale"] = torch.empty(output_size,
-                                                  1,
-                                                  dtype=torch.float32)
-        params_dict["weight_offset"] = torch.empty(output_size,
-                                                   1,
-                                                   dtype=torch.float32)
+        params_dict["weight_scale"] = torch.empty(output_size, 1, dtype=torch.float32)
+        params_dict["weight_offset"] = torch.empty(output_size, 1, dtype=torch.float32)
        return params_dict

    def apply(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
-        tp_rank: Optional[int] = 0,
+        bias: torch.Tensor | None = None,
+        tp_rank: int | None = 0,
    ) -> torch.Tensor:
        original_dtype = x.dtype
        input_shape = x.shape
@@ -156,18 +136,18 @@ class AscendW4A4FlatQuantDynamicLinearMethod(AscendLinearScheme):
        right_trans_matched = layer.right_trans.to(original_dtype)
        x_reshaped = x.view(-1, left_dim, right_dim)
        x_quantized_int4, activation_scale = batched_kronecker_quant(
-            x_reshaped, left_trans_matched, right_trans_matched,
-            layer.aclnn_clip_ratio)
-        x_quantized_reshaped = x_quantized_int4.view(-1,
-                                                     left_dim * right_dim // 8)
+            x_reshaped, left_trans_matched, right_trans_matched, layer.aclnn_clip_ratio
+        )
+        x_quantized_reshaped = x_quantized_int4.view(-1, left_dim * right_dim // 8)
        pertoken_scale = activation_scale.view(-1).to(torch.float32)
-        output = torch_npu.npu_quant_matmul(x_quantized_reshaped,
-                                            layer.weight_packed.t(),
-                                            layer.weight_scale.view(-1).to(
-                                                torch.float32),
-                                            pertoken_scale=pertoken_scale,
-                                            bias=None,
-                                            output_dtype=original_dtype)
+        output = torch_npu.npu_quant_matmul(
+            x_quantized_reshaped,
+            layer.weight_packed.t(),
+            layer.weight_scale.view(-1).to(torch.float32),
+            pertoken_scale=pertoken_scale,
+            bias=None,
+            output_dtype=original_dtype,
+        )
        output = output.view(*input_shape[:-1], -1)
        if bias is not None:
            output = output + bias.to(original_dtype)
@@ -176,15 +156,11 @@ class AscendW4A4FlatQuantDynamicLinearMethod(AscendLinearScheme):
    def process_weights_after_loading(self, layer):
        # NOTE: Currently, w4a4 can't support weight nz
        weight_packed = pack_int4_weights(layer.weight.data)
-        layer.register_parameter(
-            'weight_packed',
-            torch.nn.Parameter(weight_packed, requires_grad=False))
+        layer.register_parameter("weight_packed", torch.nn.Parameter(weight_packed, requires_grad=False))
        del layer.weight
        layer.weight_scale.data = layer.weight_scale.data.to(torch.float32)
        layer.weight_offset.data = layer.weight_offset.data.to(torch.float32)
-        layer.left_trans = torch.nn.Parameter(
-            layer.left_trans.data.t().contiguous())
+        layer.left_trans = torch.nn.Parameter(layer.left_trans.data.t().contiguous())
        layer.right_trans = torch.nn.Parameter(layer.right_trans.data)
-        layer.clip_ratio = torch.nn.Parameter(
-            layer.clip_ratio.data.to(torch.float32))
+        layer.clip_ratio = torch.nn.Parameter(layer.clip_ratio.data.to(torch.float32))
        layer.aclnn_clip_ratio = layer.clip_ratio.item()