From ebdc6fed03b2cf0198c305c5270e8caaa80ff8d8 Mon Sep 17 00:00:00 2001
From: Chranos <826995883@qq.com>
Date: Fri, 6 Feb 2026 14:21:14 +0800
Subject: [PATCH] fix: pass lm_head to LogitsProcessor instead of calling
 forward()

In vLLM v0.6.2, ParallelLMHead.forward() raises a RuntimeError because
its weights are meant to be applied by LogitsProcessor, which calls
lm_head.linear_method.apply() itself. Pass lm_head as the first
argument to LogitsProcessor so that it performs the
hidden_states -> logits projection internally.
---
 .../model_executor/models/transformers/causal.py    | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/vllm-v0.6.2/vllm/model_executor/models/transformers/causal.py b/vllm-v0.6.2/vllm/model_executor/models/transformers/causal.py
index 33508fd..835b2ef 100644
--- a/vllm-v0.6.2/vllm/model_executor/models/transformers/causal.py
+++ b/vllm-v0.6.2/vllm/model_executor/models/transformers/causal.py
@@ -114,15 +114,12 @@ class CausalMixin:
             # Non-last PP rank
             return None
         
-        # Apply lm_head (at wrapper level, not self.model.lm_head)
-        output = self.lm_head(hidden_states)
-        # Handle tuple output from vLLM Linear layers (output, bias)
-        if isinstance(output, tuple):
-            logits = output[0]
-        else:
-            logits = output
+        # In v0.6.2, LogitsProcessor handles the lm_head projection internally
+        # via lm_head.linear_method.apply(). Pass lm_head as the first arg.
+        logits = self.logits_processor(self.lm_head, hidden_states,
+                                       sampling_metadata)
         
-        return self.logits_processor(None, logits, sampling_metadata)
+        return logits
     
     def sample(
         self,