add dispatch_ffn_combine kernel (#3532)
### What this PR does / why we need it?

This PR introduces the Ascend implementation of the `dispatch_ffn_combine` kernel and wires it into the vLLM-Ascend runtime, together with follow-up fixes to ensure the kernel builds and runs correctly in CI.

- Add the full host and device implementation of the `dispatch_ffn_combine` kernel under `csrc/dispatch_ffn_combine`, including tiling logic, MoE routing helpers, and kernel utilities for quantized FFN dispatch.
- Integrate the new kernel with the PyTorch binding (`csrc/torch_binding.cpp`, `csrc/torch_binding_meta.cpp`) and the Ascend runtime (`vllm_ascend/ascend_forward_context.py`, `vllm_ascend/worker/model_runner_v1.py`).
- Extend fused MoE communication and token dispatch support in `vllm_ascend/ops/fused_moe`, adding the methods and utilities needed by the new dispatch path.
- Update the quantization logic in `vllm_ascend/quantization/w8a8_dynamic.py` to support the new FFN dispatch flow (the call-site gating is sketched below).
- Fix kernel build issues by adjusting `csrc/build_aclnn.sh`, the CMake configuration, and include/namespace usage in the new kernel files.
- Add an end-to-end nightly test, `tests/e2e/nightly/ops/test_dispatch_ffn_combine.py`, and helper utilities in `vllm_ascend/utils.py` to validate the new kernel.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.12.0
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.12.0

---------

Signed-off-by: mojave2 <chenchen145@huawei.com>
Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com>
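For orientation before the diff: when the forward context selects the fused all-to-all communication type, `fused_experts` receives the unwrapped weight tensor and the int64-packed scales prepared at weight-processing time, while every other path keeps the existing list-of-tensors convention. A condensed sketch of that gating; `apply_fused_experts` and its parameters are illustrative stand-ins, not the actual method signature in `w8a8_dynamic.py`:

```python
# Sketch of the fused-path gating added in this PR, condensed from the
# diff below; `moe_comm_method`, `layer`, and the weight/scale arguments
# stand in for the real runtime objects.
from vllm.forward_context import get_forward_context

from vllm_ascend.ascend_forward_context import MoECommType


def apply_fused_experts(moe_comm_method, layer, x, pertoken_scale, w1,
                        w1_scale, w2, w2_scale, topk_weights, topk_ids):
    # The fused all-to-all kernel takes a single weight tensor plus the
    # int64-packed scales prepared at weight-processing time; all other
    # communication paths keep the list-of-tensors convention.
    fused = get_forward_context().moe_comm_type == MoECommType.FUSED_ALLTOALL
    return moe_comm_method.fused_experts(
        hidden_states=x,
        pertoken_scale=pertoken_scale,
        w1=w1[0] if fused else w1,
        w1_scale=layer.fused_w1_scale if fused else w1_scale,
        w2=w2[0] if fused else w2,
        w2_scale=layer.fused_w2_scale if fused else w2_scale,
        topk_weights=topk_weights,
        topk_ids=topk_ids,
        use_int8_w8a8=True,
    )
```

Gating both conventions behind a single flag lets the existing non-fused paths stay untouched while the new kernel gets the flat layout it expects.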
--- a/vllm_ascend/quantization/w8a8_dynamic.py
+++ b/vllm_ascend/quantization/w8a8_dynamic.py
@@ -24,6 +24,7 @@ from vllm.distributed import get_ep_group
 from vllm.forward_context import get_forward_context
 from vllm_ascend.ascend_config import get_ascend_config
+from vllm_ascend.ascend_forward_context import MoECommType
 from vllm_ascend.distributed.parallel_state import get_mc2_group
 from vllm_ascend.ops.fused_moe.experts_selector import select_experts
 from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, is_enable_nz
@@ -232,13 +233,15 @@ class AscendW8A8DynamicFusedMoEMethod:
         w2 = [layer.w2_weight]
         w2_scale = [layer.w2_weight_scale]
 
+        fused_flag = get_forward_context(
+        ).moe_comm_type == MoECommType.FUSED_ALLTOALL
         return moe_comm_method.fused_experts(
             hidden_states=x,
             pertoken_scale=pertoken_scale,
-            w1=w1,
-            w1_scale=w1_scale,
-            w2=w2,
-            w2_scale=w2_scale,
+            w1=w1[0] if fused_flag else w1,
+            w1_scale=layer.fused_w1_scale if fused_flag else w1_scale,
+            w2=w2[0] if fused_flag else w2,
+            w2_scale=layer.fused_w2_scale if fused_flag else w2_scale,
             topk_weights=topk_weights,
             topk_ids=topk_ids,
             use_int8_w8a8=True,
@@ -270,6 +273,12 @@ class AscendW8A8DynamicFusedMoEMethod:
             layer.w2_weight_scale.data.shape[0], -1)
         layer.w2_weight_offset.data = layer.w2_weight_offset.data.view(
             layer.w2_weight_offset.data.shape[0], -1)
+
+        layer.fused_w1_scale = scale_from_float_to_int64(
+            layer.w13_weight_scale.data)
+        layer.fused_w2_scale = scale_from_float_to_int64(
+            layer.w2_weight_scale.data)
+
         if self.dynamic_eplb:
             layer.w13_weight_list = [
                 weight.clone()
@@ -292,3 +301,11 @@ class AscendW8A8DynamicFusedMoEMethod:
         del layer.w13_weight_scale_fp32
         del layer.w2_weight_scale
         torch.npu.empty_cache()
+
+
+def scale_from_float_to_int64(scale):
+    import numpy as np
+    scale = torch.from_numpy(
+        np.frombuffer(scale.cpu().to(torch.float32).numpy().tobytes(),
+                      dtype=np.int32).astype(np.int64)).to(scale.device)
+    return scale