[CPU] Optimize FP16 decode_attention_cpu (#10652)

This commit is contained in:
blzheng
2025-10-23 12:39:51 +08:00
committed by GitHub
parent 81fd2b0ee0
commit 13fb8b5489
4 changed files with 181 additions and 9 deletions

View File

@@ -540,7 +540,10 @@ class ParallelLMHead(VocabParallelEmbedding):
# We only support pack LMHead if it's not quantized.
if _is_cpu and _is_cpu_amx_available:
-if hasattr(self, "weight") and self.weight.dtype == torch.bfloat16:
+if hasattr(self, "weight") and self.weight.dtype in [
+torch.bfloat16,
+torch.float16,
+]:
self.quant_method = PackWeightMethod(weight_names=["weight"])
if bias: