[Quantization]300I Duo support w8a8 quantization (#1560)

### What this PR does / why we need it?
This pr supports w8a8 on 300I Duo platform. The main change is to use
`npu_quant_grouped_matmul_dequant` to replace `npu_grouped_matmul`.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
offline inference on 310p runs normally.

---------

Signed-off-by: angazenn <zengyanjia@huawei.com>
Signed-off-by: tianyitang <tangtianyi4@huawei.com>
Co-authored-by: angazenn <zengyanjia@huawei.com>
Co-authored-by: tianyitang <tangtianyi4@huawei.com>
This commit is contained in:
Angazenn
2025-07-03 22:12:46 +08:00
committed by GitHub
parent 6d7cb14a24
commit 9fbd8017c0
5 changed files with 369 additions and 41 deletions

View File

@@ -77,6 +77,7 @@ from vllm_ascend.pool.metadata import PoolingMetadata
from vllm_ascend.sample.rejection_sampler import AscendRejectionSampler
from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ,
ProfileExecuteDuration, is_310p,
maybe_converting_weight_acl_format,
vllm_version_is)
from vllm_ascend.worker.eagle_proposer_v1 import EagleProposer
from vllm_ascend.worker.mtp_proposer_v1 import MtpProposer
@@ -1196,6 +1197,9 @@ class NPUModelRunner(LoRAModelRunnerMixin):
model_kwargs["kv_caches"] = self.kv_caches
model_kwargs["attn_metadata"] = attn_metadata
if self.torchair_graph_enabled and not with_prefill:
maybe_converting_weight_acl_format(self.model,
ACL_FORMAT_FRACTAL_NZ)
compiled_model = self._get_torchair_lazy_compiled_model(
padded_batch_size)
hidden_states = compiled_model(
@@ -1207,6 +1211,9 @@ class NPUModelRunner(LoRAModelRunnerMixin):
)
else:
assert self.model is not None
maybe_converting_weight_acl_format(self.model,
ACL_FORMAT_FRACTAL_ND)
hidden_states = self.model(
input_ids=input_ids,
positions=positions,
@@ -1878,6 +1885,10 @@ class NPUModelRunner(LoRAModelRunnerMixin):
kv, tuple), "kv_cache must be a tuple"
torch._dynamo.mark_static(kv[0])
torch._dynamo.mark_static(kv[1])
maybe_converting_weight_acl_format(self.model,
ACL_FORMAT_FRACTAL_NZ)
compiled_model = self._get_torchair_lazy_compiled_model(
num_tokens)
hidden_states = compiled_model(
@@ -1889,6 +1900,9 @@ class NPUModelRunner(LoRAModelRunnerMixin):
attn_metadata=attn_metadata,
)
else:
maybe_converting_weight_acl_format(self.model,
ACL_FORMAT_FRACTAL_ND)
hidden_states = model(
input_ids=input_ids,
positions=positions,