first commit
vllm_br/model_executor/layers/logits_processor.py (new file, 72 lines)
@@ -0,0 +1,72 @@
################################################################################
# Copyright (c) 2020-2025 Shanghai Biren Technology Co., Ltd. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
################################################################################

from typing import Optional

import torch
import torch_br
from fastcore.basics import patch_to

from vllm.distributed import (get_tensor_model_parallel_rank,
                              get_tensor_model_parallel_world_size,
                              tensor_model_parallel_all_reduce)
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.vocab_parallel_embedding import (
    VocabParallelEmbedding)
from vllm_br import envs


# TODO(shouqing): this patch needs to be enabled once the hang in MTP is fixed.
@patch_to(LogitsProcessor)
def _get_logits(
    self,
    hidden_states: torch.Tensor,
    lm_head: VocabParallelEmbedding,
    embedding_bias: Optional[torch.Tensor],
) -> Optional[torch.Tensor]:
    # Get the logits for the next tokens.
    logits = lm_head.quant_method.apply(lm_head,
                                        hidden_states,
                                        bias=embedding_bias)

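    # Workaround path for devices with more than 16 SPCs: stage the logits
    # in a column-major buffer before the all-reduce below.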
    spc_num = envs.VLLM_BR_DEVICE_SPC_NUM
    if spc_num > 16:
        bb_input = torch_br._empty_ut_only(size=logits.shape,
                                           dtype=logits.dtype,
                                           is_numa=False,
                                           device=logits.device,
                                           tensor_type="colmajor")

        # Work around the hang in the s1b-to-bb copy.
        bb_input.copy_(logits)
        logits = bb_input

    tp_size = get_tensor_model_parallel_world_size()
    tp_rank = get_tensor_model_parallel_rank()

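    # Emulate a tensor-parallel gather with an all-reduce: each rank writes
    # its vocab shard into its own slice of a zero-filled full-vocab buffer,
    # so the element-wise sum across ranks yields the full logits everywhere.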
    logits_ = torch.zeros((logits.shape[0], logits.shape[-1] * tp_size),
                          dtype=logits.dtype,
                          device=logits.device)

    start = logits.shape[-1] * tp_rank
    end = start + logits.shape[-1]
    logits_[:, start:end].copy_(logits)
    logits = tensor_model_parallel_all_reduce(logits_)

    # Remove paddings in vocab (if any).
    if logits is not None:
        logits = logits[..., :self.org_vocab_size]
    return logits
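
For context, a minimal sketch (not part of the commit) of the fastcore patch_to mechanism this file relies on: the decorator attaches the decorated function to the target class under the function's own name, replacing any existing method, which is how the _get_logits above overrides vLLM's LogitsProcessor._get_logits at import time. The Greeter class here is hypothetical.

from fastcore.basics import patch_to

class Greeter:
    def greet(self) -> str:
        return "hello"

# Redefines Greeter.greet in place, just as @patch_to(LogitsProcessor)
# redefines LogitsProcessor._get_logits in the file above.
@patch_to(Greeter)
def greet(self) -> str:
    return "hello, patched"

print(Greeter().greet())  # prints "hello, patched"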
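
A small single-process sketch of why the zero-pad-plus-all-reduce in the patch reproduces a gather: only one rank contributes non-zero values to each vocab slice, so summing the padded buffers (what tensor_model_parallel_all_reduce does across devices) concatenates the shards exactly. The shapes here are illustrative.

import torch

tp_size, rows, shard = 4, 2, 8  # illustrative shapes
shards = [torch.randn(rows, shard) for _ in range(tp_size)]

# Each simulated rank scatters its shard into its own slice of a zero buffer.
padded = torch.zeros(tp_size, rows, shard * tp_size)
for rank, s in enumerate(shards):
    padded[rank, :, rank * shard:(rank + 1) * shard] = s

# Summing over ranks (the all-reduce) equals concatenating the shards.
assert torch.equal(padded.sum(dim=0), torch.cat(shards, dim=-1))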