[Bugfix] Remove ModelSlim-"M4 Quantization". (#4589)
The M4 quantization method in ModelSlim adds a bias term to model weights that originally have no linear bias. PR #4235 added support for PD-MIX quantization and M4 quantization, introducing bias handling in `w8a8.py` and `w8a8_dynamic.py`, and adapting `ops/linear.py` to prevent the bias from being reset to `None` by `self.register_parameter("bias", None)`. However, that change introduced an issue where the bias was still reset to `None` in certain scenarios, causing errors during service startup. Therefore, this PR temporarily reverts support for M4 quantization. --- - vLLM version: v0.11.2 Signed-off-by: SlightwindSec <slightwindsec@gmail.com>
This commit is contained in:
@@ -277,20 +277,18 @@ class AscendRowParallelLinear(RowParallelLinear):
|
||||
weight_loader=(
|
||||
self.weight_loader_v2 if self.quant_method.__class__.__name__
|
||||
in WEIGHT_LOADER_V2_SUPPORTED else self.weight_loader))
|
||||
bias_initialized_by_quant = ("bias" in self._parameters
|
||||
and self._parameters["bias"] is not None)
|
||||
if not reduce_results and (bias and not skip_bias_add):
|
||||
raise ValueError("When not reduce the results, adding bias to the "
|
||||
"results can lead to incorrect results")
|
||||
|
||||
if bias and not bias_initialized_by_quant:
|
||||
if bias:
|
||||
self.bias = Parameter(
|
||||
torch.empty(self.output_size, dtype=params_dtype))
|
||||
set_weight_attrs(self.bias, {
|
||||
"output_dim": 0,
|
||||
"weight_loader": self.weight_loader,
|
||||
})
|
||||
elif not bias and not bias_initialized_by_quant:
|
||||
else:
|
||||
self.register_parameter("bias", None)
|
||||
|
||||
if self.custom_op is not None:
|
||||
@@ -368,9 +366,7 @@ class AscendColumnParallelLinear(ColumnParallelLinear):
|
||||
weight_loader=(
|
||||
self.weight_loader_v2 if self.quant_method.__class__.__name__
|
||||
in WEIGHT_LOADER_V2_SUPPORTED else self.weight_loader))
|
||||
bias_initialized_by_quant = ("bias" in self._parameters
|
||||
and self._parameters["bias"] is not None)
|
||||
if bias and not bias_initialized_by_quant:
|
||||
if bias:
|
||||
self.bias = Parameter(
|
||||
torch.empty(self.output_size_per_partition,
|
||||
dtype=params_dtype))
|
||||
@@ -378,7 +374,7 @@ class AscendColumnParallelLinear(ColumnParallelLinear):
|
||||
"output_dim": 0,
|
||||
"weight_loader": self.weight_loader,
|
||||
})
|
||||
elif not bias and not bias_initialized_by_quant:
|
||||
else:
|
||||
self.register_parameter("bias", None)
|
||||
|
||||
if self.custom_op is not None:
|
||||
@@ -449,16 +445,14 @@ class AscendReplicatedLinear(ReplicatedLinear):
|
||||
self.params_dtype,
|
||||
weight_loader=self.weight_loader)
|
||||
|
||||
bias_initialized_by_quant = ("bias" in self._parameters
|
||||
and self._parameters["bias"] is not None)
|
||||
if bias and not bias_initialized_by_quant:
|
||||
if bias:
|
||||
self.bias = Parameter(
|
||||
torch.empty(self.output_size, dtype=self.params_dtype))
|
||||
set_weight_attrs(self.bias, {
|
||||
"output_dim": 0,
|
||||
"weight_loader": self.weight_loader,
|
||||
})
|
||||
elif not bias and not bias_initialized_by_quant:
|
||||
else:
|
||||
self.register_parameter("bias", None)
|
||||
|
||||
if self.custom_op is not None:
|
||||
|
||||
Reference in New Issue
Block a user