diff --git a/vllm-v0.6.2/vllm/model_executor/models/transformers/causal.py b/vllm-v0.6.2/vllm/model_executor/models/transformers/causal.py
index 33508fd..835b2ef 100644
--- a/vllm-v0.6.2/vllm/model_executor/models/transformers/causal.py
+++ b/vllm-v0.6.2/vllm/model_executor/models/transformers/causal.py
@@ -114,15 +114,12 @@ class CausalMixin:
             # Non-last PP rank
             return None
 
-        # Apply lm_head (at wrapper level, not self.model.lm_head)
-        output = self.lm_head(hidden_states)
-        # Handle tuple output from vLLM Linear layers (output, bias)
-        if isinstance(output, tuple):
-            logits = output[0]
-        else:
-            logits = output
+        # In v0.6.2, LogitsProcessor handles the lm_head projection internally
+        # via lm_head.linear_method.apply(). Pass lm_head as the first arg.
+        logits = self.logits_processor(self.lm_head, hidden_states,
+                                       sampling_metadata)
 
-        return self.logits_processor(None, logits, sampling_metadata)
+        return logits
 
     def sample(
         self,