[Feat.][310P]: weightNZ feature with quant or unquant. (#6705)

NZ Format Support for Linear Layers: Implemented support for the NZ
weight format (ACL_FORMAT_FRACTAL_NZ) for linear layer weights on Ascend
310P, enhancing performance for both quantized and unquantized layers.
Unquantized Linear Method for Ascend 310P: Introduced
AscendUnquantizedLinearMethod310, which applies NZ format casting to
unquantized linear layer weights during the weight-loading process.
MRotaryEmbedding Integration: Extended Rotary Embedding support by
adding AscendMRotaryEmbedding310 to provide an Ascend-specific
implementation for MRotaryEmbedding.
Quantization Method Updates: Updated the w8a8_static quantization method
to directly transpose weights and apply NZ format casting, ensuring
consistency with the new format.
- vLLM version: v0.15.0
- vLLM main:
9562912cea

---------

Signed-off-by: Tflowers-0129 <2906339855@qq.com>
This commit is contained in:
Shaoxu Cheng
2026-02-13 15:41:02 +08:00
committed by GitHub
parent f40256b697
commit b6bc3d2f9d
7 changed files with 144 additions and 17 deletions

View File

@@ -21,6 +21,7 @@ import torch
import torch_npu
from vllm_ascend.quantization.methods.base import AscendLinearScheme
from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ
from .registry import register_scheme
@@ -72,9 +73,15 @@ class AscendW8A8LinearMethod310(AscendLinearScheme):
quant_bias = layer.quant_bias if tp_rank == 0 else None
# NOTE(310P):
# - Current torch_npu.npu_quant_matmul on Ascend 310P expects the weight layout in a transposed form
# for correct/efficient execution, so we pass `layer.weight.T` here.
# - This is a temporary workaround. The planned replacement quant-matmul op will accept the
# canonical (non-transposed) weight layout directly, so this explicit transpose will be removed
# once that op is enabled on 310P.
return torch_npu.npu_quant_matmul(
x,
layer.weight,
layer.weight.data,
layer.deq_scale,
bias=quant_bias,
output_dtype=layer.params_dtype,
@@ -82,6 +89,8 @@ class AscendW8A8LinearMethod310(AscendLinearScheme):
def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
expanding_factor = layer.weight.data.shape[1]
# ---- quant stage tensors ----
layer.aclnn_input_scale = torch.nn.Parameter(
layer.input_scale.data.repeat(expanding_factor),
requires_grad=False,
@@ -95,7 +104,9 @@ class AscendW8A8LinearMethod310(AscendLinearScheme):
requires_grad=False,
).to(layer.aclnn_input_scale.dtype)
layer.weight.data = layer.weight.data.transpose(0, 1).contiguous()
# ---- matmul stage tensor ----
layer.weight.data = torch_npu.npu_format_cast(layer.weight.data, ACL_FORMAT_FRACTAL_NZ).transpose(0, 1)
# ---- dequant stage tensors ----
layer.weight_scale.data = torch.flatten(layer.weight_scale.data)
layer.weight_offset.data = torch.flatten(layer.weight_offset.data)

View File

@@ -104,9 +104,9 @@ class AscendModelSlimConfig310(AscendModelSlimConfig):
if isinstance(layer, LinearBase):
packed = getattr(self, "packed_modules_mapping", {})
if self.is_layer_skipped_ascend(prefix, packed):
from vllm_ascend.ops.linear import AscendUnquantizedLinearMethod
from vllm_ascend._310p.ops.linear import AscendUnquantizedLinearMethod310
return AscendUnquantizedLinearMethod()
return AscendUnquantizedLinearMethod310()
scheme = create_scheme_for_layer(
quant_description=self.quant_description,