[Quantization]300I Duo support w8a8 quantization (#1560)

### What this PR does / why we need it?
This pr supports w8a8 on 300I Duo platform. The main change is to use
`npu_quant_grouped_matmul_dequant` to replace `npu_grouped_matmul`.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
offline inference on 310p runs normally.

---------

Signed-off-by: angazenn <zengyanjia@huawei.com>
Signed-off-by: tianyitang <tangtianyi4@huawei.com>
Co-authored-by: angazenn <zengyanjia@huawei.com>
Co-authored-by: tianyitang <tangtianyi4@huawei.com>
This commit is contained in:
Angazenn
2025-07-03 22:12:46 +08:00
committed by GitHub
parent 6d7cb14a24
commit 9fbd8017c0
5 changed files with 369 additions and 41 deletions

View File

@@ -23,6 +23,7 @@ from vllm.attention.backends.abstract import AttentionType
from vllm_ascend.attention.attention_v1 import AscendAttentionState
from vllm_ascend.distributed.parallel_state import get_ep_group
from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, is_310p
def quant_per_tensor(in_tensor: torch.Tensor,
@@ -42,7 +43,7 @@ class AscendW8A8LinearMethod:
def __init__(self) -> None:
# aclnn quant matmul requires to transpose matrix B, set to true by default.
self.transpose_weight = True
self.transpose_weight = not is_310p()
@staticmethod
def get_weight(
@@ -95,13 +96,24 @@ class AscendW8A8LinearMethod:
x = quant_per_tensor(x, layer.aclnn_input_scale,
layer.aclnn_input_offset)
quant_bias = layer.quant_bias if tp_rank == 0 else None
output = torch_npu.npu_quant_matmul(
x,
layer.weight,
layer.deq_scale,
bias=quant_bias,
output_dtype=original_dtype,
)
if is_310p():
# On 300I Duo platform, we need transpose again if
# using nz. This transpose can be skipped in torchair.
output = torch_npu.npu_quant_matmul(
x,
layer.weight.data.transpose(1, 0),
layer.deq_scale,
bias=quant_bias,
output_dtype=original_dtype,
)
else:
output = torch_npu.npu_quant_matmul(
x,
layer.weight,
layer.deq_scale,
bias=quant_bias,
output_dtype=original_dtype,
)
return output
def process_weights_after_loading(self, layer):
@@ -114,7 +126,8 @@ class AscendW8A8LinearMethod:
requires_grad=False).to(layer.aclnn_input_scale.dtype)
if self.transpose_weight:
layer.weight.data = layer.weight.data.transpose(0, 1).contiguous()
layer.weight.data = torch_npu.npu_format_cast(layer.weight.data, 29)
layer.weight.data = torch_npu.npu_format_cast(layer.weight.data,
ACL_FORMAT_FRACTAL_NZ)
layer.weight_scale.data = torch.flatten(layer.weight_scale.data)
layer.weight_offset.data = torch.flatten(layer.weight_offset.data)
@@ -232,6 +245,19 @@ class AscendW8A8FusedMoEMethod:
global_num_experts=global_num_experts,
)
if is_310p():
return fused_experts_310p(hidden_states=x,
w1=layer.w13_weight,
w1_scale=layer.w13_weight_scale,
w1_input_scale=layer.w13_input_scale,
w2=layer.w2_weight,
w2_scale=layer.w2_weight_scale,
w2_input_scale=layer.w2_input_scale,
topk_weights=topk_weights,
topk_ids=topk_ids,
top_k=top_k,
global_num_experts=global_num_experts,
expert_map=expert_map)
return fused_experts(hidden_states=x,
w1=layer.w13_weight,
w1_scale=layer.w13_weight_scale,
@@ -248,41 +274,48 @@ class AscendW8A8FusedMoEMethod:
expert_map=expert_map)
def process_weights_after_loading(self, layer):
# torch.npu.config.allow_internal_format = True
layer.w13_weight.data = layer.w13_weight.data.transpose(
1, 2).contiguous()
layer.w2_weight.data = layer.w2_weight.data.transpose(1,
2).contiguous()
if not is_310p():
layer.w13_weight.data = layer.w13_weight.data.transpose(
1, 2).contiguous()
layer.w2_weight.data = layer.w2_weight.data.transpose(
1, 2).contiguous()
layer.w13_weight_scale.data = layer.w13_weight_scale.data.view(
layer.w13_weight_scale.data.shape[0], -1).to(torch.float32)
layer.w13_weight_scale.data.shape[0], -1)
layer.w13_weight_offset.data = layer.w13_weight_offset.data.view(
layer.w13_weight_offset.data.shape[0], -1).to(torch.float16)
layer.w13_weight_offset.data.shape[0], -1)
layer.w2_weight_scale.data = layer.w2_weight_scale.data.view(
layer.w2_weight_scale.data.shape[0], -1).to(torch.float32)
layer.w2_weight_scale.data.shape[0], -1)
layer.w2_weight_offset.data = layer.w2_weight_offset.data.view(
layer.w2_weight_offset.data.shape[0], -1).to(torch.float16)
layer.w2_weight_offset.data.shape[0], -1)
expanding_factor_w13 = layer.w13_weight.data.shape[1]
expanding_factor_w2 = layer.w2_weight.data.shape[1]
layer.w13_input_scale.data = torch.nn.Parameter(
layer.w13_input_scale.data.repeat(
1, expanding_factor_w13)[0:1]).to(torch.float16)
layer.w2_input_scale.data = torch.nn.Parameter(
layer.w2_input_scale.data.repeat(1, expanding_factor_w2)[0:1]).to(
torch.float16)
if is_310p():
layer.w13_input_scale.data = torch.nn.Parameter(
layer.w13_input_scale.data.max())
layer.w2_input_scale.data = torch.nn.Parameter(
layer.w2_input_scale.data.max())
else:
layer.w13_input_scale.data = torch.nn.Parameter(
layer.w13_input_scale.data.repeat(1,
expanding_factor_w13)[0:1])
layer.w2_input_scale.data = torch.nn.Parameter(
layer.w2_input_scale.data.repeat(1, expanding_factor_w2)[0:1])
layer.w13_input_offset.data = torch.nn.Parameter(
layer.w13_input_scale.data.repeat(
1, expanding_factor_w13)[0:1]).to(torch.int8)
layer.w13_input_scale.data.repeat(1, expanding_factor_w13)[0:1])
layer.w2_input_offset.data = torch.nn.Parameter(
layer.w2_input_scale.data.repeat(1, expanding_factor_w2)[0:1]).to(
torch.int8)
layer.w2_input_scale.data.repeat(1, expanding_factor_w2)[0:1])
# NZ
layer.w13_weight.data = torch_npu.npu_format_cast(
layer.w13_weight.data, 29).contiguous()
layer.w2_weight.data = torch_npu.npu_format_cast(
layer.w2_weight.data, 29).contiguous()
# converting ACL_FORMAT_FRACTAL_NZ.
# npu_quant_grouped_matmul_dequant in eager mode does not accept
# ACL_FORMAT_FRACTAL_NZ.
if not is_310p():
layer.w13_weight.data = torch_npu.npu_format_cast(
layer.w13_weight.data, ACL_FORMAT_FRACTAL_NZ).contiguous()
layer.w2_weight.data = torch_npu.npu_format_cast(
layer.w2_weight.data, ACL_FORMAT_FRACTAL_NZ).contiguous()
class AscendC8KVCacheMethod:
@@ -407,6 +440,69 @@ class AscendC8KVCacheMethod:
return output
def fused_experts_310p(
hidden_states: torch.Tensor,
w1: torch.Tensor,
w1_scale: torch.Tensor,
w1_input_scale: torch.Tensor,
w2: torch.Tensor,
w2_scale: torch.Tensor,
w2_input_scale: torch.Tensor,
topk_weights: torch.Tensor,
topk_ids: torch.Tensor,
top_k: int,
global_num_experts: int,
expert_map: torch.Tensor = None,
) -> torch.Tensor:
ep_size = get_ep_group().world_size
local_num_experts = global_num_experts // ep_size
local_num_group = top_k // ep_size
bsz, _ = hidden_states.shape
flatten_topk_ids = topk_ids.view(-1)
sorted_topk_ids = torch.argsort(flatten_topk_ids.float())
sorted_topk_ids = sorted_topk_ids.to(torch.int32)
sorted_hidden_states = hidden_states.index_select(
0, sorted_topk_ids // local_num_group)
experts_id = torch.arange(0,
local_num_experts,
dtype=topk_ids.dtype,
device=topk_ids.device)
num_tokens_per_expert = (flatten_topk_ids.unsqueeze(-1) == experts_id).to(
torch.float32).sum(0)
topk_scales = topk_weights.view(-1).index_select(
0, sorted_topk_ids).unsqueeze(-1)
group_list = num_tokens_per_expert.cumsum(dim=0).to(torch.int64)
gate_up_out = torch_npu.npu_quant_grouped_matmul_dequant(
x=sorted_hidden_states,
quantized_weight=w1,
weight_scale=w1_scale,
group_list=group_list,
x_scale=w1_input_scale,
quant_mode="pertensor")
gate_up_out = torch_npu.npu_swiglu(gate_up_out.to(torch.float32)).to(
torch.float16)
gate_up_out *= topk_scales
down_out = torch_npu.npu_quant_grouped_matmul_dequant(
x=gate_up_out,
quantized_weight=w2,
weight_scale=w2_scale,
group_list=group_list,
x_scale=w2_input_scale,
quant_mode="pertensor")
unsorted_topk_ids = torch.argsort(sorted_topk_ids.float()).to(torch.int32)
unsorted_hidden_states = down_out.index_select(0, unsorted_topk_ids)
final_hidden_states = unsorted_hidden_states.reshape(
bsz, top_k // ep_size, -1).sum(1)
return final_hidden_states
def fused_experts(
hidden_states: torch.Tensor,
w1: torch.Tensor,

View File

@@ -31,6 +31,7 @@ from torch_npu.npu.streams import Event
from vllm.logger import logger
import vllm_ascend.envs as envs
from vllm_ascend.ascend_config import get_ascend_config
try:
# Recent release of torchair has moved these ops to `.scope`.
@@ -175,6 +176,28 @@ def aligned_16(tensor: torch.Tensor):
return new_tensor
def maybe_converting_weight_acl_format(model, format=ACL_FORMAT_FRACTAL_NZ):
# currently, there are some operations which do not support ACL_FORMAT_FRACTAL_NZ
# in eager mode but support it in torchair graph mode. since ACL_FORMAT_FRACTAL_NZ
# is much more preferred than ACL_FORMAT_FRACTAL_ND on 300I Duo, we add this
# conversion when using torchair graph mode on 300I Duo platform.
# TODO: we will remove this conversion if npu_quant_grouped_matmul_dequant
# accepts weight format of ACL_FORMAT_FRACTAL_NZ in eager mode.
from vllm.model_executor.layers.fused_moe.layer import FusedMoE
use_torchair = get_ascend_config().torchair_graph_config.enabled
if not is_310p() or not use_torchair:
return
for module in model.modules():
if isinstance(module, FusedMoE):
if torch_npu.get_npu_format(module.w13_weight.data) == format:
return
module.w13_weight.data = torch_npu.npu_format_cast(
module.w13_weight.data, format)
module.w2_weight.data = torch_npu.npu_format_cast(
module.w2_weight.data, format)
def try_register_lib(lib_name: str, lib_info: str = ""):
import importlib
import importlib.util

View File

@@ -77,6 +77,7 @@ from vllm_ascend.pool.metadata import PoolingMetadata
from vllm_ascend.sample.rejection_sampler import AscendRejectionSampler
from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ,
ProfileExecuteDuration, is_310p,
maybe_converting_weight_acl_format,
vllm_version_is)
from vllm_ascend.worker.eagle_proposer_v1 import EagleProposer
from vllm_ascend.worker.mtp_proposer_v1 import MtpProposer
@@ -1196,6 +1197,9 @@ class NPUModelRunner(LoRAModelRunnerMixin):
model_kwargs["kv_caches"] = self.kv_caches
model_kwargs["attn_metadata"] = attn_metadata
if self.torchair_graph_enabled and not with_prefill:
maybe_converting_weight_acl_format(self.model,
ACL_FORMAT_FRACTAL_NZ)
compiled_model = self._get_torchair_lazy_compiled_model(
padded_batch_size)
hidden_states = compiled_model(
@@ -1207,6 +1211,9 @@ class NPUModelRunner(LoRAModelRunnerMixin):
)
else:
assert self.model is not None
maybe_converting_weight_acl_format(self.model,
ACL_FORMAT_FRACTAL_ND)
hidden_states = self.model(
input_ids=input_ids,
positions=positions,
@@ -1878,6 +1885,10 @@ class NPUModelRunner(LoRAModelRunnerMixin):
kv, tuple), "kv_cache must be a tuple"
torch._dynamo.mark_static(kv[0])
torch._dynamo.mark_static(kv[1])
maybe_converting_weight_acl_format(self.model,
ACL_FORMAT_FRACTAL_NZ)
compiled_model = self._get_torchair_lazy_compiled_model(
num_tokens)
hidden_states = compiled_model(
@@ -1889,6 +1900,9 @@ class NPUModelRunner(LoRAModelRunnerMixin):
attn_metadata=attn_metadata,
)
else:
maybe_converting_weight_acl_format(self.model,
ACL_FORMAT_FRACTAL_ND)
hidden_states = model(
input_ids=input_ids,
positions=positions,