[Bugfix][DispatchFFNCombine] resolve vec error caused by unaligned UB access (#6707)

### What this PR does / why we need it?
1. Fix a vec error caused by an unaligned UB access in
DispatchFFNCombine (a minimal sketch of the alignment arithmetic follows this list);
2. Fix the `expert_token_nums` tensor being allocated on the host instead of
on the NPU in moe_comm_method.py;
3. Fix a multi-core copy issue for `expert_token_nums` in the DispatchFFNCombine
op (a single AIV copy is sufficient).
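
For context, fix 1 boils down to padding every unified-buffer (UB) byte offset to a 32-byte boundary before it is handed to the next `GetBufferByByte` call. Below is a minimal sketch of that arithmetic in plain Python, not the kernel code itself; the `align_up` helper and the example sizes are illustrative only.

```python
# Illustrative only: mimics the AlignUp(x, 32) rounding added in this PR so that
# each UB sub-buffer starts on a 32-byte boundary.
UB_ALIGN = 32

def align_up(nbytes: int, align: int = UB_ALIGN) -> int:
    """Round nbytes up to the next multiple of align."""
    return (nbytes + align - 1) // align * align

EP, SIZEOF_FLOAT, SIZEOF_UINT32 = 16, 4, 4  # example values, not taken from the PR

# Before the fix: raw byte counts can leave a later buffer's start address unaligned.
off = 0
off += EP * SIZEOF_FLOAT   # 64 (still aligned, but only by coincidence)
off += SIZEOF_UINT32       # 68 -> the next buffer would start off a 32-byte boundary
print("unpadded offset:", off)

# After the fix: every increment is rounded up, so each buffer stays 32-byte aligned.
off = 0
off += align_up(EP * SIZEOF_FLOAT)   # 64
off += align_up(SIZEOF_UINT32)       # 96
print("padded offset:", off)
```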

### Does this PR introduce _any_ user-facing change?

No, this PR does not introduce any user-facing changes. The fixes only touch
internal memory-access and tensor-allocation logic and do not modify any public
APIs, interfaces, or user-visible behavior.

### How was this patch tested?

`export VLLM_ASCEND_ENABLE_FUSED_MC2=1`
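
For reference, a hypothetical way to exercise this path end to end could look like the following; the model path, parallel size, and prompt are placeholders and not part of this PR.

```python
# Hypothetical smoke test: enable the fused MC2 path before vLLM is imported,
# then run a short offline generation on an MoE checkpoint (placeholder values).
import os
os.environ["VLLM_ASCEND_ENABLE_FUSED_MC2"] = "1"

from vllm import LLM, SamplingParams

llm = LLM(model="path/to/moe-model", tensor_parallel_size=2)
outputs = llm.generate(["Hello, world"], SamplingParams(max_tokens=16))
print(outputs[0].outputs[0].text)
```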


- vLLM version: v0.15.0
- vLLM main: 9562912cea

Signed-off-by: xulei_ict <xulei292@huawei.com>
Co-authored-by: xulei_ict <xulei292@huawei.com>
3 changed files with 19 additions and 12 deletions

```diff
@@ -779,10 +779,10 @@ private:
     AscendC::GlobalTensor<int32_t> ExpertTokenNums;
     ExpertTokenNums.SetGlobalBuffer(reinterpret_cast<__gm__ int32_t*>(params.ptrExpertTokenNums));
-    AscendC::GlobalTensor<int32_t> LcalCumsumMM;
-    LcalCumsumMM.SetGlobalBuffer(reinterpret_cast<__gm__ int32_t*>(workspaceInfo.ptrcumsumMM + (params.EP - 1) * params.expertPerRank * sizeof(int32_t)));
-    CopyGMToGM(ExpertTokenNums, LcalCumsumMM, params.expertPerRank, params.ubMoveNum);
-    AscendC::SyncAll<true>();
+    if(coreIdx == 0)
+    {
+        CopyGMToGM(ExpertTokenNums, cumsumMM[(params.EP - 1) * params.expertPerRank], params.expertPerRank, params.ubMoveNum);
+    }
     uint16_t syncgmm1Idx = 0;
     AscendC::CrossCoreSetFlag<0x2, PIPE_MTE3>(syncgmm1Idx / CROSS_CORE_FLAG_MAX_SET_COUNT);
     syncgmm1Idx++;
@@ -921,11 +921,11 @@ private:
     AscendC::LocalTensor<float> statusTensor = resource.ubBuf.template GetBufferByByte<float>(uboffset);
     uboffset += sendRankNum_ * UB_ALIGN;
     AscendC::LocalTensor<float> gatherMaskOutTensor = resource.ubBuf.template GetBufferByByte<float>(uboffset);
-    uboffset += params.EP * sizeof(float);
+    uboffset += AlignUp(params.EP * sizeof(float), 32);
     AscendC::LocalTensor<uint32_t> gatherTmpTensor = resource.ubBuf.template GetBufferByByte<uint32_t>(uboffset);
-    uboffset += sizeof(uint32_t);
+    uboffset += AlignUp(sizeof(uint32_t), 32);
     AscendC::LocalTensor<float> statusSumOutTensor = resource.ubBuf.template GetBufferByByte<float>(uboffset);
-    uboffset += sizeof(float);
+    uboffset += AlignUp(sizeof(float), 32);
     shmem.CrossRankSyncV2Wait(statusTensor, gatherMaskOutTensor, gatherTmpTensor, statusSumOutTensor);
     MoeTokenUnpermuteTilingData tilingData;
     MoeTokenUnpermuteTiling(params.problemShape.m() * params.topK, n2, params.topK, tilingData, coreNum / 2);
```

```diff
@@ -756,8 +756,9 @@ CATLASS_DEVICE
     ExpertTokenNums.SetGlobalBuffer(reinterpret_cast<__gm__ int32_t*>(params.ptrExpertTokenNums));
     AscendC::GlobalTensor<int32_t> LcalCumsumMM;
     LcalCumsumMM.SetGlobalBuffer(reinterpret_cast<__gm__ int32_t*>(workspaceInfo.ptrcumsumMM + (params.EP - 1) * params.expertPerRank * sizeof(int32_t)));
-    CopyGMToGM(ExpertTokenNums, LcalCumsumMM, params.expertPerRank, params.ubMoveNum);
-    AscendC::SyncAll<true>();
+    if (coreIdx == 0) {
+        CopyGMToGM(ExpertTokenNums, LcalCumsumMM, params.expertPerRank, params.ubMoveNum);
+    }
     uint32_t curGroupOffset = 0;
     int32_t prevSumBeforeRank = 0;
```

```diff
@@ -277,6 +277,13 @@ class FusedMC2CommImpl(MoECommMethod):
     Communication and Computation parallelism on Ascend devices.
     """
 
+    def __init__(self, moe_config):
+        super().__init__(moe_config)
+        if envs_ascend.VLLM_ASCEND_ENABLE_FUSED_MC2 == 1:
+            self.expert_token_nums = torch.zeros([self.moe_config.num_local_experts], dtype=torch.int32, device="npu")
+        else:
+            self.expert_token_nums = None
+
     def _get_token_dispatcher(self):
         return TokenDispatcherWithMC2()
@@ -325,7 +332,6 @@ class FusedMC2CommImpl(MoECommMethod):
         expert_tokens = None
         if envs_ascend.VLLM_ASCEND_ENABLE_FUSED_MC2 == 1:
             out = torch.empty_like(hidden_states)
-            expert_token_nums = torch.zeros([self.moe_config.num_local_experts], dtype=torch.int32)
             torch.ops._C_ascend.dispatch_ffn_combine( # type: ignore
                 x=hidden_states,
                 weight1=w1,
@@ -337,9 +343,9 @@ class FusedMC2CommImpl(MoECommMethod):
                 group=self.token_dispatcher.moe_all_to_all_group_name,
                 max_output_size=65536,
                 out=out,
-                expert_token_nums=expert_token_nums,
+                expert_token_nums=self.expert_token_nums,
             )
-            expert_tokens = expert_token_nums
+            expert_tokens = self.expert_token_nums
         elif envs_ascend.VLLM_ASCEND_ENABLE_FUSED_MC2 == 2:
             assert expert_map is not None, "expert_map cannot be None."
             group_list_type = 1
```