From ebdc6fed03b2cf0198c305c5270e8caaa80ff8d8 Mon Sep 17 00:00:00 2001 From: Chranos <826995883@qq.com> Date: Fri, 6 Feb 2026 14:21:14 +0800 Subject: [PATCH] fix: pass lm_head to LogitsProcessor instead of calling forward() In vLLM v0.6.2, ParallelLMHead.forward() raises RuntimeError because its weights are meant to be applied by LogitsProcessor via lm_head.linear_method.apply(). Pass lm_head as the first arg to LogitsProcessor, which handles the hidden_states -> logits projection internally. --- .../model_executor/models/transformers/causal.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/vllm-v0.6.2/vllm/model_executor/models/transformers/causal.py b/vllm-v0.6.2/vllm/model_executor/models/transformers/causal.py index 33508fd..835b2ef 100644 --- a/vllm-v0.6.2/vllm/model_executor/models/transformers/causal.py +++ b/vllm-v0.6.2/vllm/model_executor/models/transformers/causal.py @@ -114,15 +114,12 @@ class CausalMixin: # Non-last PP rank return None - # Apply lm_head (at wrapper level, not self.model.lm_head) - output = self.lm_head(hidden_states) - # Handle tuple output from vLLM Linear layers (output, bias) - if isinstance(output, tuple): - logits = output[0] - else: - logits = output + # In v0.6.2, LogitsProcessor handles the lm_head projection internally + # via lm_head.linear_method.apply(). Pass lm_head as the first arg. + logits = self.logits_processor(self.lm_head, hidden_states, + sampling_metadata) - return self.logits_processor(None, logits, sampling_metadata) + return logits def sample( self,