[feature] Support W8A8 PD-Mix Quantization (#4235)

In PD-separated deployment scenarios:

* MoE layers use dynamic quantization exclusively.
* For the Attention module, Prefill (P) nodes use **dynamic** quantization, while Decode (D) nodes use **static** quantization.

In PD-mixed deployment scenarios:

* **All components fall back to dynamic quantization**, as it is difficult to distinguish between Prefill and Decode tokens.

___

- vLLM version: v0.11.2
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.2

---------

Signed-off-by: SlightwindSec <slightwindsec@gmail.com>
Signed-off-by: Slightwind <slightwindsec@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-11-30 11:57:26 +08:00
parent ff7061317f
commit 18eefc23c3
6 changed files with 93 additions and 7 deletions
--- a/vllm_ascend/ops/linear.py
+++ b/vllm_ascend/ops/linear.py
@@ -277,18 +277,20 @@ class AscendRowParallelLinear(RowParallelLinear):
            weight_loader=(
                self.weight_loader_v2 if self.quant_method.__class__.__name__
                in WEIGHT_LOADER_V2_SUPPORTED else self.weight_loader))
+        bias_initialized_by_quant = ("bias" in self._parameters
+                                     and self._parameters["bias"] is not None)
        if not reduce_results and (bias and not skip_bias_add):
            raise ValueError("When not reduce the results, adding bias to the "
                             "results can lead to incorrect results")

-        if bias:
+        if bias and not bias_initialized_by_quant:
            self.bias = Parameter(
                torch.empty(self.output_size, dtype=params_dtype))
            set_weight_attrs(self.bias, {
                "output_dim": 0,
                "weight_loader": self.weight_loader,
            })
-        else:
+        elif not bias and not bias_initialized_by_quant:
            self.register_parameter("bias", None)

        if self.custom_op is not None:
@@ -366,7 +368,9 @@ class AscendColumnParallelLinear(ColumnParallelLinear):
            weight_loader=(
                self.weight_loader_v2 if self.quant_method.__class__.__name__
                in WEIGHT_LOADER_V2_SUPPORTED else self.weight_loader))
-        if bias:
+        bias_initialized_by_quant = ("bias" in self._parameters
+                                     and self._parameters["bias"] is not None)
+        if bias and not bias_initialized_by_quant:
            self.bias = Parameter(
                torch.empty(self.output_size_per_partition,
                            dtype=params_dtype))
@@ -374,7 +378,7 @@ class AscendColumnParallelLinear(ColumnParallelLinear):
                "output_dim": 0,
                "weight_loader": self.weight_loader,
            })
-        else:
+        elif not bias and not bias_initialized_by_quant:
            self.register_parameter("bias", None)

        if self.custom_op is not None:
@@ -445,14 +449,16 @@ class AscendReplicatedLinear(ReplicatedLinear):
                                         self.params_dtype,
                                         weight_loader=self.weight_loader)

-        if bias:
+        bias_initialized_by_quant = ("bias" in self._parameters
+                                     and self._parameters["bias"] is not None)
+        if bias and not bias_initialized_by_quant:
            self.bias = Parameter(
                torch.empty(self.output_size, dtype=self.params_dtype))
            set_weight_attrs(self.bias, {
                "output_dim": 0,
                "weight_loader": self.weight_loader,
            })
-        else:
+        elif not bias and not bias_initialized_by_quant:
            self.register_parameter("bias", None)

        if self.custom_op is not None: