first commit

2026-03-10 13:31:25 +08:00
parent ba974cecfa
commit b62b889355
2604 changed files with 438977 additions and 0 deletions
--- a/vllm_br/distributed/communicator.py
+++ b/vllm_br/distributed/communicator.py
@@ -0,0 +1,60 @@
+################################################################################
+# Copyright(c)2020-2025 Shanghai Biren Technology Co., Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+################################################################################
+
+from typing import Optional
+
+import torch
+import torch.distributed as dist
+
+from vllm.distributed.device_communicators.base_device_communicator import (
+    DeviceCommunicatorBase)
+from vllm.logger import logger
+from vllm_br import envs
+
+
+class SUPACommunicator(DeviceCommunicatorBase):
+
+    def __init__(self,
+                 cpu_group: dist.ProcessGroup,
+                 device: Optional[torch.device] = None,
+                 device_group: Optional[dist.ProcessGroup] = None,
+                 unique_name: str = ""):
+        super().__init__(cpu_group, device, device_group, unique_name)
+        self.device = torch.supa.current_device()
+
+    # TODO: Deprecate this method in the future if torch_br support gather
+    def gather(self,
+               input_: torch.Tensor,
+               dst: int = 0,
+               dim: int = -1) -> torch.Tensor:
+        """All gather as gather"""
+
+        output_tensor = self.all_gather(input_, dim)
+        if self.rank_in_group == dst:
+            return output_tensor
+        return None
+
+    def all_reduce(self, input_: torch.Tensor) -> torch.Tensor:
+        if envs.VLLM_BR_USE_FP32_ALL_REDUCE and input_ is not None and input_.dtype == torch.bfloat16:
+            logger.debug(
+                '[Patch] patch all_reduce: use fp32 all_reduce when env VLLM_BR_USE_FP32_ALL_REDUCE is set'
+            )
+            input_ = input_.to(torch.float32)
+            dist.all_reduce(input_, group=self.device_group)
+            input_ = input_.to(torch.bfloat16)
+        else:
+            dist.all_reduce(input_, group=self.device_group)
+        return input_