### What this PR does / why we need it?
Reformat and modernize the quantization, sampling, and worker block-table modules listed below: deprecated `typing.Dict`/`List`/`Optional` annotations are migrated to builtin generics and PEP 604 unions (`dict[...]`, `list[...]`, `X | None`), string quoting is normalized to double quotes, and yapf-style line wrapping is collapsed. No functional change is intended. A representative diff (the ModelSlim quantization config module) appears at the end of this description; an illustrative before/after sketch follows the file list.

**Scope of Changes**:
| File Path |
| :--- |
| `vllm_ascend/quantization/compressed_tensors/compressed_tensors.py` |
| `vllm_ascend/quantization/quant_config.py` |
| `vllm_ascend/quantization/utils.py` |
| `vllm_ascend/quantization/w4a16.py` |
| `vllm_ascend/quantization/w4a4_flatquant_dynamic.py` |
| `vllm_ascend/quantization/w4a8_dynamic.py` |
| `vllm_ascend/quantization/w8a16.py` |
| `vllm_ascend/quantization/w8a8.py` |
| `vllm_ascend/quantization/w8a8_dynamic.py` |
| `vllm_ascend/quantization/w8a8_pdmix.py` |
| `vllm_ascend/quantization/w8a8mxfp8.py` |
| `vllm_ascend/sample/rejection_sampler.py` |
| `vllm_ascend/sample/sampler.py` |
| `vllm_ascend/worker/block_table.py` |
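For illustration, the annotation and formatting pattern applied across these files looks roughly like the following. This is a minimal sketch with made-up names (`get_mapping`, `table`), not lines taken from the diff:

```python
# Before (deprecated typing generics, yapf-style wrapping):
#
#     from typing import Dict, List, Optional
#
#     def get_mapping(model_type: str,
#                     overrides: Optional[Dict[str, List[str]]] = None
#                     ) -> Optional[str]:
#         ...

# After (builtin generics per PEP 585, unions per PEP 604, double quotes):
def get_mapping(model_type: str, overrides: dict[str, list[str]] | None = None) -> str | None:
    # Hypothetical lookup table, standing in for the real packed-modules mappings.
    table: dict[str, list[str]] = {"qkv_proj": ["q_proj", "k_proj", "v_proj"]}
    if overrides is not None:
        table.update(overrides)
    entry = table.get(model_type)
    return None if entry is None else ", ".join(entry)
```

Both spellings are equivalent at runtime; PEP 585 builtin generics require Python 3.9+ and PEP 604 unions in annotations require 3.10+, so the refactor assumes the project already targets those versions.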
### Does this PR introduce _any_ user-facing change?
No. This is a formatting and type-annotation refactor; no behavioral change is intended.
### How was this patch tested?
- vLLM version: v0.13.0
- vLLM main: 2c24bc6996
Signed-off-by: MrZ20 <2609716663@qq.com>
```diff
@@ -21,20 +21,18 @@ This module provides the AscendModelSlimConfig class for parsing quantization
 configs generated by the ModelSlim tool, along with model-specific mappings.
 """
 
+from collections.abc import Mapping
 from types import MappingProxyType
-from typing import Any, Dict, List, Mapping, Optional
+from typing import Any, Optional
 
 import torch
 from vllm.config import get_current_vllm_config
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.linear import LinearBase
-from vllm.model_executor.layers.quantization import \
-    register_quantization_config
-from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig, QuantizeMethodBase)
-from vllm.model_executor.layers.vocab_parallel_embedding import (
-    UnquantizedEmbeddingMethod, VocabParallelEmbedding)
+from vllm.model_executor.layers.quantization import register_quantization_config
+from vllm.model_executor.layers.quantization.base_config import QuantizationConfig, QuantizeMethodBase
+from vllm.model_executor.layers.vocab_parallel_embedding import UnquantizedEmbeddingMethod, VocabParallelEmbedding
 from vllm.model_executor.models.utils import WeightsMapper
 
 from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD
@@ -45,7 +43,7 @@ logger = init_logger(__name__)
 
 # key: model_type
 # value: orig_to_new_prefix
-QUANT_MODEL_PREFIX_MAPPINGS: Dict[str, Dict[str, str]] = {
+QUANT_MODEL_PREFIX_MAPPINGS: dict[str, dict[str, str]] = {
     "qwen3_vl_moe": {
         "visual.": "model.visual.",
         "language_model.lm_head.": "lm_head.",
@@ -60,7 +58,7 @@ QUANT_MODEL_PREFIX_MAPPINGS: Dict[str, Dict[str, str]] = {
 
 # key: model_type
 # value: dict of fused module name -> list of original module names
-packed_modules_model_mapping: Dict[str, Dict[str, List[str]]] = {
+packed_modules_model_mapping: dict[str, dict[str, list[str]]] = {
     "qwen3_moe": {
         "qkv_proj": [
             "q_proj",
@@ -71,52 +69,44 @@ packed_modules_model_mapping: Dict[str, Dict[str, List[str]]] = {
             "gate_proj",
             "up_proj",
         ],
-        "experts":
-        ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
+        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
     },
     "deepseek_v2": {
         "gate_up_proj": ["gate_proj", "up_proj"],
-        "experts":
-        ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
-        "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"]
+        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
+        "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"],
     },
     "deepseek_v3": {
         "gate_up_proj": ["gate_proj", "up_proj"],
-        "experts":
-        ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
-        "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"]
+        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
+        "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"],
     },
     "pangu_ultra_moe": {
         "gate_up_proj": ["gate_proj", "up_proj"],
-        "experts":
-        ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
-        "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"]
+        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
+        "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"],
     },
     "kimi_k2": {
         "gate_up_proj": ["gate_proj", "up_proj"],
-        "experts":
-        ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
-        "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"]
+        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
+        "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"],
     },
     "deepseek_v32": {
         "gate_up_proj": ["gate_proj", "up_proj"],
-        "experts":
-        ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
-        "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"]
+        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
+        "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"],
     },
     # NOTE 1.The quantized MTP layer of deepseek on the NPU is not quantized;
     # NOTE 2.The description file generated by the current msmodelslim tool does not have
     # MTP layer info. Please manually add it and set the value to FLOAT.
     "deepseek_mtp": {
         "gate_up_proj": ["gate_proj", "up_proj"],
-        "experts":
-        ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"]
+        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
     },
     "pangu_ultra_moe_mtp": {
         "gate_up_proj": ["gate_proj", "up_proj"],
-        "experts":
-        ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
-        "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"]
+        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
+        "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"],
     },
     "qwen3_next": {
         "qkv_proj": [
@@ -126,8 +116,7 @@ packed_modules_model_mapping: Dict[str, Dict[str, List[str]]] = {
         ],
         "gate_up_proj": ["gate_proj", "up_proj"],
         "in_proj": ["in_proj_qkvz", "in_proj_ba"],
-        "experts":
-        ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"]
+        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
     },
     "qwen2_5_vl": {
         "qkv_proj": [
@@ -150,8 +139,7 @@ packed_modules_model_mapping: Dict[str, Dict[str, List[str]]] = {
             "gate_proj",
             "up_proj",
         ],
-        "experts":
-        ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
+        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
     },
     "glm4_moe": {
         "qkv_proj": [
@@ -163,20 +151,17 @@ packed_modules_model_mapping: Dict[str, Dict[str, List[str]]] = {
             "gate_proj",
             "up_proj",
         ],
-        "experts":
-        ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"]
+        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
     },
     "glm4_moe_lite": {
         "gate_up_proj": ["gate_proj", "up_proj"],
-        "experts":
-        ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
-        "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"]
+        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
+        "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"],
     },
     "longcat_flash": {
         "gate_up_proj": ["gate_proj", "up_proj"],
-        "experts":
-        ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
-        "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"]
+        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
+        "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"],
     },
     "minimax_m2": {
         "qkv_proj": [
@@ -184,17 +169,17 @@ packed_modules_model_mapping: Dict[str, Dict[str, List[str]]] = {
             "k_proj",
             "v_proj",
         ],
-        "experts": ["experts.0.w1", "experts.0.w2", "experts.0.w3"]
-    }
+        "experts": ["experts.0.w1", "experts.0.w2", "experts.0.w3"],
+    },
 }
 
 
-def get_packed_modules_mapping(model_type: str) -> Dict[str, List[str]]:
+def get_packed_modules_mapping(model_type: str) -> dict[str, list[str]]:
     """Get packed modules mapping for a model type.
 
     Args:
         model_type: The model type string (e.g., "deepseek_v3").
 
     Returns:
         Dictionary mapping fused module names to their component module names.
         Returns empty dict if model_type is not found.
@@ -202,12 +187,12 @@ def get_packed_modules_mapping(model_type: str) -> Dict[str, List[str]]:
     return packed_modules_model_mapping.get(model_type, {})
 
 
-def get_prefix_mapping(model_type: str) -> Dict[str, str]:
+def get_prefix_mapping(model_type: str) -> dict[str, str]:
     """Get prefix mapping for a model type.
 
     Args:
         model_type: The model type string (e.g., "qwen3_vl_moe").
 
     Returns:
         Dictionary mapping original prefixes to new prefixes.
         Returns empty dict if model_type is not found.
@@ -216,15 +201,15 @@ def get_prefix_mapping(model_type: str) -> Dict[str, str]:
 
 
 def get_linear_quant_type(
-        quant_description: Dict[str, Any], prefix: str,
-        packed_modules_mapping: Dict[str, Any]) -> Optional[str]:
+    quant_description: dict[str, Any], prefix: str, packed_modules_mapping: dict[str, Any]
+) -> str | None:
     """Determine the quantization type for a linear layer.
 
     Args:
         quant_description: The quantization description dictionary.
         prefix: The layer prefix.
         packed_modules_mapping: Mapping for packed/fused modules.
 
     Returns:
         The quantization type string (e.g., "W8A8_DYNAMIC").
     """
@@ -232,11 +217,10 @@ def get_linear_quant_type(
     if proj_name in packed_modules_mapping:
         quant_type = None
         shard_prefixes = [
-            prefix.replace(proj_name, shard_proj_name)
-            for shard_proj_name in packed_modules_mapping[proj_name]
+            prefix.replace(proj_name, shard_proj_name) for shard_proj_name in packed_modules_mapping[proj_name]
         ]
         for shard_prefix in shard_prefixes:
-            shard_quant_type = quant_description[shard_prefix + '.weight']
+            shard_quant_type = quant_description[shard_prefix + ".weight"]
 
             if quant_type is None:
                 quant_type = shard_quant_type
@@ -244,72 +228,68 @@ def get_linear_quant_type(
                 raise ValueError(
                     f"Not all shards of {prefix} are quantized with same quant type."
                     f"Shard {proj_name} uses {shard_quant_type}, but another shard"
-                    f"use {quant_type}. Please check quantization config.")
+                    f"use {quant_type}. Please check quantization config."
+                )
     else:
-        quant_type = quant_description[prefix + '.weight']
+        quant_type = quant_description[prefix + ".weight"]
     return quant_type
 
 
 def get_quant_type_for_layer(
-        quant_description: Dict[str, Any],
-        prefix: str,
-        layer_type: str,
-        packed_modules_mapping: Optional[Dict[str,
-                                              Any]] = None) -> Optional[str]:
+    quant_description: dict[str, Any],
+    prefix: str,
+    layer_type: str,
+    packed_modules_mapping: dict[str, Any] | None = None,
+) -> str | None:
     """Determine the quantization type for a layer.
 
     Args:
         quant_description: The quantization description dictionary.
         prefix: The layer prefix.
         layer_type: The type of layer ("linear", "moe", "attention").
         packed_modules_mapping: Mapping for packed/fused modules.
 
     Returns:
         The quantization type string (e.g., "W8A8_DYNAMIC").
     """
     if packed_modules_mapping is None:
         packed_modules_mapping = dict()
     # Attention
-    if layer_type == "attention" and 'fa_quant_type' in quant_description.keys(
-    ):
-        return quant_description['fa_quant_type']
+    if layer_type == "attention" and "fa_quant_type" in quant_description:
+        return quant_description["fa_quant_type"]
     # Linear / MoE
-    return get_linear_quant_type(quant_description, prefix,
-                                 packed_modules_mapping)
+    return get_linear_quant_type(quant_description, prefix, packed_modules_mapping)
 
 
 def create_scheme_for_layer(
-        quant_description: Dict[str, Any],
-        prefix: str,
-        layer_type: str,
-        packed_modules_mapping: Optional[Dict[str, Any]] = None):
+    quant_description: dict[str, Any],
+    prefix: str,
+    layer_type: str,
+    packed_modules_mapping: dict[str, Any] | None = None,
+):
     """Create a quantization scheme instance for a layer.
 
     Args:
         quant_description: The quantization description dictionary.
         prefix: The layer prefix.
         layer_type: The type of layer ("linear", "moe", "attention").
        packed_modules_mapping: Mapping for packed/fused modules.
 
     Returns:
         An instance of the appropriate quantization scheme class.
     """
     logger.info_once("Using the vLLM Ascend modelslim Quantization now!")
-    quant_type = get_quant_type_for_layer(quant_description, prefix,
-                                          layer_type, packed_modules_mapping)
+    quant_type = get_quant_type_for_layer(quant_description, prefix, layer_type, packed_modules_mapping)
 
     if quant_type is None:
-        raise ValueError(
-            f"Could not determine quantization type for layer {prefix}.")
+        raise ValueError(f"Could not determine quantization type for layer {prefix}.")
 
     # Use registry to get scheme class
     scheme_cls = get_scheme_class(quant_type, layer_type)
     if scheme_cls is not None:
         return scheme_cls()
 
-    raise NotImplementedError(
-        f"Currently, vLLM Ascend doesn't support {quant_type} for {layer_type}."
-    )
+    raise NotImplementedError(f"Currently, vLLM Ascend doesn't support {quant_type} for {layer_type}.")
 
 
 @register_quantization_config(ASCEND_QUANTIZATION_METHOD)
@@ -321,13 +301,13 @@ class AscendModelSlimConfig(QuantizationConfig):
     quantized using the ModelSlim tool.
     """
 
-    def __init__(self, quant_config: Dict[str, Any]):
+    def __init__(self, quant_config: dict[str, Any]):
         super().__init__()
         self.quant_description = quant_config
         # TODO(whx): remove this adaptation after adding "shared_head"
         # to prefix of DeepSeekShareHead in vLLM.
         extra_quant_dict = {}
-        for k in self.quant_description.keys():
+        for k in self.quant_description:
            if "shared_head" in k:
                new_k = k.replace(".shared_head.", ".")
                extra_quant_dict[new_k] = self.quant_description[k]
@@ -344,25 +324,23 @@ class AscendModelSlimConfig(QuantizationConfig):
         return ASCEND_QUANTIZATION_METHOD
 
     @classmethod
-    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
+    def get_supported_act_dtypes(cls) -> list[torch.dtype]:
         return [torch.int8, torch.float16, torch.bfloat16]
 
     @classmethod
     def get_min_capability(cls) -> int:
-        raise NotImplementedError(
-            "Ascend hardware dose not support \"get_min_capability\" feature.")
+        raise NotImplementedError('Ascend hardware dose not support "get_min_capability" feature.')
 
     @classmethod
-    def get_config_filenames(cls) -> List[str]:
+    def get_config_filenames(cls) -> list[str]:
         return ["quant_model_description.json"]
 
     @classmethod
-    def from_config(cls, config: Dict[str, Any]) -> "AscendModelSlimConfig":
+    def from_config(cls, config: dict[str, Any]) -> "AscendModelSlimConfig":
         return cls(config)
 
     @classmethod
-    def override_quantization_method(cls, hf_quant_cfg,
-                                     user_quant) -> Optional[str]:
+    def override_quantization_method(cls, hf_quant_cfg, user_quant) -> str | None:
         if hf_quant_cfg is not None:
             quant_method = hf_quant_cfg.get("quant_method", None)
             if not quant_method and torch.npu.is_available():
@@ -373,15 +351,17 @@ class AscendModelSlimConfig(QuantizationConfig):
         # TODO (Levi-JQ): will be removed when QuantizationConfig.apply_vllm_mapper is implemented
         prefix_mapping = QUANT_MODEL_PREFIX_MAPPINGS.get(model_type)
         if prefix_mapping:
-            hf_to_vllm_mapper = WeightsMapper(
-                orig_to_new_prefix=prefix_mapping)
+            hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix=prefix_mapping)
             return hf_to_vllm_mapper._map_name(prefix)
         return prefix
 
-    def get_quant_method(self, layer: torch.nn.Module,
-                         prefix: str) -> Optional["QuantizeMethodBase"]:
-        from .method_adapters import (AscendEmbeddingMethod, AscendFusedMoEMethod,
-                                      AscendKVCacheMethod, AscendLinearMethod)
+    def get_quant_method(self, layer: torch.nn.Module, prefix: str) -> Optional["QuantizeMethodBase"]:
+        from .method_adapters import (
+            AscendEmbeddingMethod,
+            AscendFusedMoEMethod,
+            AscendKVCacheMethod,
+            AscendLinearMethod,
+        )
 
         vllm_config = get_current_vllm_config()
         model_type = vllm_config.model_config.hf_config.model_type
@@ -390,81 +370,67 @@ class AscendModelSlimConfig(QuantizationConfig):
             # Adapt to Minimax architecture: update layer names to MoE convention
             prefix = prefix.replace("mlp", "block_sparse_moe")
         # Normalize the prefix by stripping specific expert indices (e.g., 'experts.0' -> 'experts')
-        parts = prefix.split('.')
+        parts = prefix.split(".")
         if "experts" in parts and len(parts) > 2:
             exp_idx = parts.index("experts")
             if exp_idx + 1 < len(parts) and parts[exp_idx + 1].isdigit():
-                parts = parts[:exp_idx + 1]
+                parts = parts[: exp_idx + 1]
         prefix = ".".join(parts)
 
         if model_type in packed_modules_model_mapping:
-            self.packed_modules_mapping = packed_modules_model_mapping[
-                model_type]
+            self.packed_modules_mapping = packed_modules_model_mapping[model_type]
             prefix = self.quant_prefix_mapper(model_type, prefix)
 
         from vllm_ascend.utils import vllm_version_is
 
         if vllm_version_is("v0.15.0"):
             from vllm.attention.layer import Attention  # type: ignore
         else:
             from vllm.model_executor.layers.attention import Attention
 
         if prefix.startswith("language_model"):
-            prefix = prefix.split('.', 1)[-1]
+            prefix = prefix.split(".", 1)[-1]
         if isinstance(layer, LinearBase):
-            if self.is_layer_skipped_ascend(prefix,
-                                            self.packed_modules_mapping):
+            if self.is_layer_skipped_ascend(prefix, self.packed_modules_mapping):
                 # Delayed import to avoid circular import
-                from vllm_ascend.ops.linear import \
-                    AscendUnquantizedLinearMethod
+                from vllm_ascend.ops.linear import AscendUnquantizedLinearMethod
 
                 return AscendUnquantizedLinearMethod()
-            scheme = create_scheme_for_layer(self.quant_description, prefix,
-                                             "linear",
-                                             self.packed_modules_mapping)
+            scheme = create_scheme_for_layer(self.quant_description, prefix, "linear", self.packed_modules_mapping)
             return AscendLinearMethod(scheme)
-        elif isinstance(layer, Attention) and \
-                'fa_quant_type' in self.quant_description.keys() and \
-                self.quant_description['fa_quant_type'] is not None:
-            scheme = create_scheme_for_layer(self.quant_description, prefix,
-                                             "attention",
-                                             self.packed_modules_mapping)
+        elif (
+            isinstance(layer, Attention)
+            and "fa_quant_type" in self.quant_description
+            and self.quant_description["fa_quant_type"] is not None
+        ):
+            scheme = create_scheme_for_layer(self.quant_description, prefix, "attention", self.packed_modules_mapping)
             return AscendKVCacheMethod(scheme)
         elif isinstance(layer, FusedMoE):
-            if self.is_layer_skipped_ascend(prefix,
-                                            self.packed_modules_mapping):
+            if self.is_layer_skipped_ascend(prefix, self.packed_modules_mapping):
                 # Delayed import to avoid circular import
-                from vllm_ascend.ops.fused_moe.fused_moe import \
-                    AscendUnquantizedFusedMoEMethod
+                from vllm_ascend.ops.fused_moe.fused_moe import AscendUnquantizedFusedMoEMethod
 
                 return AscendUnquantizedFusedMoEMethod(layer.moe_config)
-            scheme = create_scheme_for_layer(self.quant_description, prefix,
-                                             "moe",
-                                             self.packed_modules_mapping)
+            scheme = create_scheme_for_layer(self.quant_description, prefix, "moe", self.packed_modules_mapping)
            return AscendFusedMoEMethod(scheme, layer.moe_config)
         elif isinstance(layer, VocabParallelEmbedding):
-            if self.is_layer_skipped_ascend(prefix,
-                                            self.packed_modules_mapping):
+            if self.is_layer_skipped_ascend(prefix, self.packed_modules_mapping):
                 return UnquantizedEmbeddingMethod()
-            scheme = create_scheme_for_layer(self.quant_description, prefix,
-                                             "linear",
-                                             self.packed_modules_mapping)
+            scheme = create_scheme_for_layer(self.quant_description, prefix, "linear", self.packed_modules_mapping)
             return AscendEmbeddingMethod(scheme)
         return None
 
-    def is_layer_skipped_ascend(
-            self,
-            prefix: str,
-            fused_mapping: Mapping[str, List[str]] = MappingProxyType({})):
+    def is_layer_skipped_ascend(self, prefix: str, fused_mapping: Mapping[str, list[str]] = MappingProxyType({})):
         # adapted from vllm.model_executor.layers.quantization.utils.quant_utils.is_layer_skipped
         proj_name = prefix.split(".")[-1]
         if proj_name in fused_mapping:
             shard_prefixes = [
-                prefix.replace(proj_name, shard_proj_name)
-                for shard_proj_name in fused_mapping[proj_name]
+                prefix.replace(proj_name, shard_proj_name) for shard_proj_name in fused_mapping[proj_name]
             ]
 
             is_skipped = None
             for shard_prefix in shard_prefixes:
-                is_shard_skipped = self.quant_description[shard_prefix +
-                                                          '.weight'] == "FLOAT"
+                is_shard_skipped = self.quant_description[shard_prefix + ".weight"] == "FLOAT"
 
                 if is_skipped is None:
                     is_skipped = is_shard_skipped
@@ -472,12 +438,13 @@ class AscendModelSlimConfig(QuantizationConfig):
                    raise ValueError(
                        f"Detected some but not all shards of {prefix} "
                        "are quantized. All shards of fused layers "
-                        "to have the same precision.")
+                        "to have the same precision."
+                    )
        else:
-            is_skipped = self.quant_description[prefix + '.weight'] == "FLOAT"
+            is_skipped = self.quant_description[prefix + ".weight"] == "FLOAT"
 
        assert is_skipped is not None
        return is_skipped
 
-    def get_scaled_act_names(self) -> List[str]:
+    def get_scaled_act_names(self) -> list[str]:
        return []
```