[BugFix]add all2all when dp_size > 1 && downgrade npu_dequant_swiglu_quant (#819)

### What this PR does / why we need it?
1. This PR introduces a native `all_to_all` communication operator to fix
`allgather` bugs when `dp_size > 1`. It also adds a naive
force-load-balance implementation for profile runs (see the sketches
after this list).
2. The operator `npu_dequant_swiglu_quant` only supports input
`hidden_states` with dtype `torch.int32`. That tensor occupies
`global_bs * seq_len * topk * hidden_size` elements, which can grow very
large as `ep_size` increases. We therefore disable this operator and fall
back to the original, unfused `swiglu` and `quantize` path.
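
For point 1, a naive force-load-balance during profile runs amounts to ignoring the router's real choices so that tokens spread (roughly) evenly over experts. Below is a minimal sketch under that assumption; `topk_ids` and `global_num_experts` follow common fused-MoE naming and are not taken verbatim from this patch:

```python
import torch

def force_load_balance(topk_ids: torch.Tensor,
                       global_num_experts: int) -> torch.Tensor:
    # A profile run only needs a worst-case-balanced memory footprint,
    # not real routing: uniformly random expert ids give every expert an
    # (almost) equal share of the tokens.
    return torch.randint_like(topk_ids, 0, global_num_experts)
```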
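For point 2, the unfused fallback conceptually performs SwiGLU in the activation dtype and then dynamic per-token int8 quantization. A pure-PyTorch sketch of that path; the actual code uses NPU kernels, so the function below is illustrative only:

```python
import torch
import torch.nn.functional as F

def swiglu_then_quant(gate_up: torch.Tensor):
    # Unfused path: split the fused gate/up projection, apply SwiGLU.
    gate, up = gate_up.chunk(2, dim=-1)
    out = F.silu(gate) * up
    # Dynamic per-token symmetric int8 quantization: scale = max-abs / 127.
    scale = out.abs().amax(dim=-1, keepdim=True).clamp(min=1e-8) / 127.0
    quantized = torch.round(out / scale).clamp(-128, 127).to(torch.int8)
    return quantized, scale.squeeze(-1)
```

Because this keeps the activation in its original dtype until quantization, it avoids materializing the large `torch.int32` input that the fused kernel requires.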

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
By performing offline inference:

![image](https://github.com/user-attachments/assets/e003d5dc-0753-41ae-9303-e87f73ac6828)

---------

Signed-off-by: angazenn <zengyanjia@huawei.com>
Co-authored-by: angazenn <zengyanjia@huawei.com>
Commit: 1e67089bc9 (parent 68fb63428b)
Author: Angazenn, committed via GitHub, 2025-05-15 09:19:55 +08:00
7 changed files with 317 additions and 80 deletions


```diff
@@ -14,10 +14,10 @@
 # limitations under the License.
 # This file is a part of the vllm-ascend project.
 #
-from typing import Optional
+from typing import List, Optional
 
 import torch
-from torch.distributed import ProcessGroup
+import torch.distributed as dist
 
 from vllm.distributed.device_communicators.base_device_communicator import \
     DeviceCommunicatorBase
@@ -25,11 +25,51 @@ from vllm.distributed.device_communicators.base_device_communicator import \
 class NPUCommunicator(DeviceCommunicatorBase):
 
     def __init__(self,
-                 cpu_group: ProcessGroup,
+                 cpu_group: dist.ProcessGroup,
                  device: Optional[torch.device] = None,
-                 device_group: Optional[ProcessGroup] = None,
+                 device_group: Optional[dist.ProcessGroup] = None,
                  unique_name: str = ""):
         super().__init__(cpu_group, device, device_group, unique_name)
         # TODO(hz): Refer to CudaCommunicator's implementation to integrate PyHcclCommunicator
         # init device according to rank
         self.device = torch.npu.current_device()
+
+    def all_to_all(self,
+                   input_: torch.Tensor,
+                   scatter_dim: int = 0,
+                   gather_dim: int = -1,
+                   scatter_sizes: Optional[List[int]] = None,
+                   gather_sizes: Optional[List[int]] = None) -> torch.Tensor:
+
+        if scatter_dim < 0:
+            scatter_dim += input_.dim()
+        if gather_dim < 0:
+            gather_dim += input_.dim()
+
+        if scatter_sizes is not None and gather_sizes is not None:
+            input_list = [
+                t.contiguous()
+                for t in torch.split(input_, scatter_sizes, scatter_dim)
+            ]
+            output_list = []
+            tensor_shape_base = input_list[self.rank].size()
+            for i in range(self.world_size):
+                tensor_shape = list(tensor_shape_base)
+                tensor_shape[gather_dim] = gather_sizes[i]
+                output_list.append(
+                    torch.empty(tensor_shape,
+                                dtype=input_.dtype,
+                                device=input_.device))
+
+        else:
+            input_list = [
+                t.contiguous() for t in torch.tensor_split(
+                    input_, self.world_size, scatter_dim)
+            ]
+            output_list = [
+                torch.empty_like(input_list[i]) for i in range(self.world_size)
+            ]
+
+        dist.all_to_all(output_list, input_list, group=self.device_group)
+        output_tensor = torch.cat(output_list, dim=gather_dim).contiguous()
+        return output_tensor
```
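
For context, here is a hypothetical call into the new method for the uneven (token-dispatch) case. `comm`, `tokens`, `send_counts`, and `recv_counts` are illustrative names, not identifiers from this patch; the receive counts must be exchanged between ranks beforehand:

```python
# Sketch only: assumes `comm` is an initialized NPUCommunicator and
# torch.distributed is set up with world_size ranks.
# send_counts[i]: rows this rank scatters to rank i (sums to tokens.size(0)).
# recv_counts[i]: rows this rank gathers from rank i, i.e. rank i's
# send_counts[comm.rank], exchanged ahead of time.
out = comm.all_to_all(tokens,
                      scatter_dim=0,
                      gather_dim=0,
                      scatter_sizes=send_counts,
                      gather_sizes=recv_counts)

# Without explicit sizes the input is split evenly across ranks with
# torch.tensor_split, matching the else branch above.
out_even = comm.all_to_all(tokens, scatter_dim=0, gather_dim=0)
```

Passing explicit `scatter_sizes`/`gather_sizes` matters for MoE dispatch, where each rank sends a different number of tokens to each expert-parallel peer; the even-split fallback covers the simple fixed-shape case.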