[CPU] Optimize FP16 decode_attention_cpu (#10652)

2025-10-23 12:39:51 +08:00
parent 81fd2b0ee0
commit 13fb8b5489
4 changed files with 181 additions and 9 deletions
--- a/python/sglang/srt/layers/vocab_parallel_embedding.py
+++ b/python/sglang/srt/layers/vocab_parallel_embedding.py
@@ -540,7 +540,10 @@ class ParallelLMHead(VocabParallelEmbedding):

        # We only support pack LMHead if it's not quantized.
        if _is_cpu and _is_cpu_amx_available:
-            if hasattr(self, "weight") and self.weight.dtype == torch.bfloat16:
+            if hasattr(self, "weight") and self.weight.dtype in [
+                torch.bfloat16,
+                torch.float16,
+            ]:
                self.quant_method = PackWeightMethod(weight_names=["weight"])

        if bias: