[Feat.][310P]: weight NZ format support for quantized and unquantized linear layers. (#6705)
NZ Format Support for Linear Layers: Implemented support for the NZ
(N-dimensional Z-order) format for linear layer weights on Ascend 310P,
enhancing performance for both quantized and unquantized layers.
Unquantized Linear Method for Ascend 310P: Introduced
AscendUnquantizedLinearMethod310, which handles unquantized linear-layer
weights and applies NZ-format casting to them during the weight-loading
process.
MRotaryEmbedding Integration: Extended Rotary Embedding support by
adding AscendMRotaryEmbedding310 to provide an Ascend-specific
implementation for MRotaryEmbedding.
Quantization Method Updates: Updated the w8a8_static quantization method
to transpose weights directly and apply NZ-format casting, ensuring
consistency with the new format.
- vLLM version: v0.15.0
- vLLM main:
9562912cea
---------
Signed-off-by: Tflowers-0129 <2906339855@qq.com>
This commit is contained in:
@@ -21,6 +21,7 @@ import torch
|
||||
import torch_npu
|
||||
|
||||
from vllm_ascend.quantization.methods.base import AscendLinearScheme
|
||||
from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ
|
||||
|
||||
from .registry import register_scheme
|
||||
|
||||
@@ -72,9 +73,15 @@ class AscendW8A8LinearMethod310(AscendLinearScheme):
|
||||
|
||||
quant_bias = layer.quant_bias if tp_rank == 0 else None
|
||||
|
||||
# NOTE(310P):
|
||||
# - Current torch_npu.npu_quant_matmul on Ascend 310P expects the weight layout in a transposed form
|
||||
# for correct/efficient execution, so we pass `layer.weight.T` here.
|
||||
# - This is a temporary workaround. The planned replacement quant-matmul op will accept the
|
||||
# canonical (non-transposed) weight layout directly, so this explicit transpose will be removed
|
||||
# once that op is enabled on 310P.
|
||||
return torch_npu.npu_quant_matmul(
|
||||
x,
|
||||
layer.weight,
|
||||
layer.weight.data,
|
||||
layer.deq_scale,
|
||||
bias=quant_bias,
|
||||
output_dtype=layer.params_dtype,
|
||||
@@ -82,6 +89,8 @@ class AscendW8A8LinearMethod310(AscendLinearScheme):
|
||||
|
||||
def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
|
||||
expanding_factor = layer.weight.data.shape[1]
|
||||
|
||||
# ---- quant stage tensors ----
|
||||
layer.aclnn_input_scale = torch.nn.Parameter(
|
||||
layer.input_scale.data.repeat(expanding_factor),
|
||||
requires_grad=False,
|
||||
@@ -95,7 +104,9 @@ class AscendW8A8LinearMethod310(AscendLinearScheme):
|
||||
requires_grad=False,
|
||||
).to(layer.aclnn_input_scale.dtype)
|
||||
|
||||
layer.weight.data = layer.weight.data.transpose(0, 1).contiguous()
|
||||
# ---- matmul stage tensor ----
|
||||
layer.weight.data = torch_npu.npu_format_cast(layer.weight.data, ACL_FORMAT_FRACTAL_NZ).transpose(0, 1)
|
||||
|
||||
# ---- dequant stage tensors ----
|
||||
layer.weight_scale.data = torch.flatten(layer.weight_scale.data)
|
||||
layer.weight_offset.data = torch.flatten(layer.weight_offset.data)
|
||||
|
||||
@@ -104,9 +104,9 @@ class AscendModelSlimConfig310(AscendModelSlimConfig):
|
||||
if isinstance(layer, LinearBase):
|
||||
packed = getattr(self, "packed_modules_mapping", {})
|
||||
if self.is_layer_skipped_ascend(prefix, packed):
|
||||
from vllm_ascend.ops.linear import AscendUnquantizedLinearMethod
|
||||
from vllm_ascend._310p.ops.linear import AscendUnquantizedLinearMethod310
|
||||
|
||||
return AscendUnquantizedLinearMethod()
|
||||
return AscendUnquantizedLinearMethod310()
|
||||
|
||||
scheme = create_scheme_for_layer(
|
||||
quant_description=self.quant_description,
|
||||
|
||||
Reference in New Issue
Block a user