################################################################################
# Copyright (c) 2020-2025 Shanghai Biren Technology Co., Ltd. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
################################################################################

from typing import Optional

import torch
import torch_br
from fastcore.basics import patch_to
from vllm.distributed import (get_tensor_model_parallel_rank,
                              get_tensor_model_parallel_world_size,
                              tensor_model_parallel_all_reduce)
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.vocab_parallel_embedding import (
    VocabParallelEmbedding)

from vllm_br import envs


# TODO(shouqing): enable this patch once the hang in MTP is fixed.
@patch_to(LogitsProcessor)
def _get_logits(
    self,
    hidden_states: torch.Tensor,
    lm_head: VocabParallelEmbedding,
    embedding_bias: Optional[torch.Tensor],
) -> Optional[torch.Tensor]:
    # Compute this rank's shard of the logits for the next tokens.
    logits = lm_head.quant_method.apply(lm_head,
                                        hidden_states,
                                        bias=embedding_bias)

    spc_num = envs.VLLM_BR_DEVICE_SPC_NUM
    if spc_num > 16:
        # Work around the hang in the s1b-to-bb copy: stage the logits in a
        # column-major buffer before the collective.
        bb_input = torch_br._empty_ut_only(size=logits.shape,
                                           dtype=logits.dtype,
                                           is_numa=False,
                                           device=logits.device,
                                           tensor_type="colmajor")
        bb_input.copy_(logits)
        logits = bb_input

    # Assemble the full-vocab logits without a gather: zero-pad the local shard
    # into its own vocab slice, then all-reduce(sum) across tensor-parallel
    # ranks. Every other rank contributes zeros in that slice, so the sum
    # reconstructs the gathered logits.
    tp_size = get_tensor_model_parallel_world_size()
    tp_rank = get_tensor_model_parallel_rank()
    logits_ = torch.zeros((logits.shape[0], logits.shape[-1] * tp_size),
                          dtype=logits.dtype,
                          device=logits.device)
    start = logits.shape[-1] * tp_rank
    end = start + logits.shape[-1]
    logits_[:, start:end].copy_(logits)
    logits = tensor_model_parallel_all_reduce(logits_)

    # Remove paddings in vocab (if any).
    if logits is not None:
        logits = logits[..., :self.org_vocab_size]
    return logits
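
# ------------------------------------------------------------------------------
# Illustrative sketch (not part of the patch): the all-reduce path above relies
# on the fact that summing zero-padded shards is equivalent to an all-gather
# along the vocab dimension. The single-process simulation below demonstrates
# that equivalence on plain CPU tensors; `_demo_zero_pad_all_reduce` is a
# hypothetical helper that exists only for illustration, not in vllm_br.
def _demo_zero_pad_all_reduce(tp_size: int = 4,
                              num_tokens: int = 2,
                              shard_vocab: int = 8) -> None:
    full = torch.randn(num_tokens, shard_vocab * tp_size)
    # Each rank holds one contiguous vocab shard of the logits.
    shards = full.chunk(tp_size, dim=-1)
    # Each rank writes its shard into its slice of a zero-filled buffer.
    padded = []
    for rank, shard in enumerate(shards):
        buf = torch.zeros_like(full)
        start = shard_vocab * rank
        buf[:, start:start + shard_vocab].copy_(shard)
        padded.append(buf)
    # Summing the buffers (what all-reduce does across ranks) reconstructs the
    # full logits exactly, since the other ranks contribute zeros everywhere.
    reduced = torch.stack(padded).sum(dim=0)
    assert torch.equal(reduced, full)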
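
# ------------------------------------------------------------------------------
# Illustrative sketch (not part of the patch): `fastcore.basics.patch_to`
# attaches the decorated function to an existing class, which is how this
# module overrides LogitsProcessor._get_logits at import time. A minimal,
# hypothetical example of the mechanism:
def _demo_patch_to() -> None:

    class _Greeter:
        pass

    @patch_to(_Greeter)
    def greet(self) -> str:
        # After decoration, `greet` is an ordinary bound method of _Greeter.
        return "hello"

    assert _Greeter().greet() == "hello"


if __name__ == "__main__":
    _demo_zero_pad_all_reduce()
    _demo_patch_to()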