From cba7ad6c59e2d97ef698e38731f748a574ba8a1b Mon Sep 17 00:00:00 2001
From: Chranos <826995883@qq.com>
Date: Wed, 11 Feb 2026 15:17:07 +0800
Subject: [PATCH] add deepseekv3 and llama4

---
 .../vllm_mlu/vllm_mlu/model_executor/models/llama4.py  | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/vllm-v0.6.2/vllm_mlu/vllm_mlu/model_executor/models/llama4.py b/vllm-v0.6.2/vllm_mlu/vllm_mlu/model_executor/models/llama4.py
index 495b2a4..c147c1d 100644
--- a/vllm-v0.6.2/vllm_mlu/vllm_mlu/model_executor/models/llama4.py
+++ b/vllm-v0.6.2/vllm_mlu/vllm_mlu/model_executor/models/llama4.py
@@ -176,14 +176,14 @@ def vllm__llama4__Llama4Attention__forward(
     ==================
     '''
 
-    # QK norm (教训 #2: use contiguous + reshape)
+    # QK norm (MLU fused_rms_norm requires matching dtypes, skip .float())
     if self.qk_norm is not None:
         q = q.contiguous().reshape(-1, self.head_dim)
-        q = (self.qk_norm(q.float())
-             .contiguous().reshape(-1, self.q_size).to(q.dtype))
+        q = (self.qk_norm(q)
+             .contiguous().reshape(-1, self.q_size))
         k = k.contiguous().reshape(-1, self.head_dim)
-        k = (self.qk_norm(k.float())
-             .contiguous().reshape(-1, self.kv_size).to(k.dtype))
+        k = (self.qk_norm(k)
+             .contiguous().reshape(-1, self.kv_size))
 
     # Temperature tuning for NoPE layers
     if self.attn_temperature_tuning and self.nope: