support pangu moe w8a8c8 and docs (#1477)

### What this PR does / why we need it?
support pangu moe w8a8c8

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed with newly added test.

Signed-off-by: zhuyilin <809721801@qq.com>
This commit is contained in:
Zhu Yi Lin
2025-06-28 18:51:07 +08:00
committed by GitHub
parent c59d69d9e6
commit b308a7a258
8 changed files with 689 additions and 50 deletions

View File

@@ -98,6 +98,9 @@ class AscendQuantConfig(QuantizationConfig):
'fa_quant_type' in self.quant_description.keys() and \
self.quant_description['fa_quant_type'] is not None:
return AscendKVCacheMethod(self, prefix)
elif isinstance(layer, Attention) and self.quant_description.get(
'kv_quant_type') == 'C8':
return AscendKVCacheMethod(self, prefix)
elif isinstance(layer, FusedMoE):
if self.is_layer_skipped_ascend(prefix,
self.packed_modules_mapping):
@@ -235,32 +238,11 @@ class AscendKVCacheMethod(BaseKVCacheMethod):
if hasattr(self.quant_method, "process_weights_after_loading"):
self.quant_method.process_weights_after_loading(layer)
def apply(self,
          layer: torch.nn.Module,
          query: torch.Tensor,
          key: torch.Tensor,
          value: torch.Tensor,
          k_cache: List[torch.Tensor],
          v_cache: List[torch.Tensor],
          scale: torch.Tensor,
          block_tables: torch.Tensor,
          isPrefill: bool,
          attn_metadata,
          output,
          seq_lens_tensor_cpu: Optional[int] = None) -> torch.Tensor:
    """Run quantized attention by delegating to the configured quant method.

    The attention mask and slot mapping are unpacked from ``attn_metadata``
    and forwarded as positionals; every other argument passes through
    unchanged to ``self.quant_method.apply``.
    """
    # Unpack the two metadata fields the backend expects as positionals.
    attn_mask = attn_metadata.attn_mask
    slot_mapping = attn_metadata.slot_mapping
    return self.quant_method.apply(layer, query, key, value, k_cache,
                                   v_cache, scale, block_tables, isPrefill,
                                   attn_mask, slot_mapping, output,
                                   seq_lens_tensor_cpu=seq_lens_tensor_cpu)
def apply(self, layer: torch.nn.Module, query: torch.Tensor,
          key: torch.Tensor, value: torch.Tensor, kv_cache, attn_metadata,
          attn_type, scale, output) -> torch.Tensor:
    """Forward the attention call to the wrapped quantized KV-cache method.

    Pure delegation: all arguments are handed to ``self.quant_method.apply``
    in order, and its result is returned unchanged.
    """
    # Bind the delegate once; the concrete quant method owns all the logic.
    delegate = self.quant_method.apply
    return delegate(layer, query, key, value, kv_cache, attn_metadata,
                    attn_type, scale, output)
class AscendFusedMoEMethod(FusedMoEMethodBase):