add dispatch_ffn_combine kernel (#3532)
### What this PR does / why we need it?

This PR introduces the Ascend implementation of the `dispatch_ffn_combine` kernel and wires it into the vLLM-Ascend runtime, together with follow-up fixes to ensure the kernel builds and runs correctly in CI.

- Add the full host and device implementation of the `dispatch_ffn_combine` kernel under `csrc/dispatch_ffn_combine`, including tiling logic, MoE routing helpers, and kernel utilities for quantized FFN dispatch.
- Integrate the new kernel with the PyTorch binding (`csrc/torch_binding.cpp`, `csrc/torch_binding_meta.cpp`) and the Ascend runtime (`vllm_ascend/ascend_forward_context.py`, `vllm_ascend/worker/model_runner_v1.py`).
- Extend fused MoE communication and token dispatch support in `vllm_ascend/ops/fused_moe`, adding the methods and utilities needed by the new dispatch path.
- Update the quantization logic in `vllm_ascend/quantization/w8a8_dynamic.py` to support the new FFN dispatch flow (the call-site gating is sketched below).
- Fix kernel build issues by adjusting `csrc/build_aclnn.sh`, the CMake configuration, and include/namespace usage in the new kernel files.
- Add an end-to-end nightly test, `tests/e2e/nightly/ops/test_dispatch_ffn_combine.py`, and helper utilities in `vllm_ascend/utils.py` to validate the new kernel.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.12.0
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.12.0

---------

Signed-off-by: mojave2 <chenchen145@huawei.com>
Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com>
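For orientation before the diff: when the forward context selects the fused all-to-all communication type, `fused_experts` receives the unwrapped weight tensor and the int64-packed scales prepared at weight-processing time, while every other path keeps the existing list-of-tensors convention. A condensed sketch of that gating; `apply_fused_experts` and its parameters are illustrative stand-ins, not the actual method signature in `w8a8_dynamic.py`:

```python
# Sketch of the fused-path gating added in this PR, condensed from the
# diff below; `moe_comm_method`, `layer`, and the weight/scale arguments
# stand in for the real runtime objects.
from vllm.forward_context import get_forward_context

from vllm_ascend.ascend_forward_context import MoECommType


def apply_fused_experts(moe_comm_method, layer, x, pertoken_scale, w1,
                        w1_scale, w2, w2_scale, topk_weights, topk_ids):
    # The fused all-to-all kernel takes a single weight tensor plus the
    # int64-packed scales prepared at weight-processing time; all other
    # communication paths keep the list-of-tensors convention.
    fused = get_forward_context().moe_comm_type == MoECommType.FUSED_ALLTOALL
    return moe_comm_method.fused_experts(
        hidden_states=x,
        pertoken_scale=pertoken_scale,
        w1=w1[0] if fused else w1,
        w1_scale=layer.fused_w1_scale if fused else w1_scale,
        w2=w2[0] if fused else w2,
        w2_scale=layer.fused_w2_scale if fused else w2_scale,
        topk_weights=topk_weights,
        topk_ids=topk_ids,
        use_int8_w8a8=True,
    )
```

Gating both conventions behind a single flag lets the existing non-fused paths stay untouched while the new kernel gets the flat layout it expects.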
--- a/vllm_ascend/quantization/w8a8_dynamic.py
+++ b/vllm_ascend/quantization/w8a8_dynamic.py
@@ -24,6 +24,7 @@ from vllm.distributed import get_ep_group
 from vllm.forward_context import get_forward_context
 from vllm_ascend.ascend_config import get_ascend_config
+from vllm_ascend.ascend_forward_context import MoECommType
 from vllm_ascend.distributed.parallel_state import get_mc2_group
 from vllm_ascend.ops.fused_moe.experts_selector import select_experts
 from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, is_enable_nz
@@ -232,13 +233,15 @@ class AscendW8A8DynamicFusedMoEMethod:
         w2 = [layer.w2_weight]
         w2_scale = [layer.w2_weight_scale]
 
+        fused_flag = get_forward_context(
+        ).moe_comm_type == MoECommType.FUSED_ALLTOALL
         return moe_comm_method.fused_experts(
             hidden_states=x,
             pertoken_scale=pertoken_scale,
-            w1=w1,
-            w1_scale=w1_scale,
-            w2=w2,
-            w2_scale=w2_scale,
+            w1=w1[0] if fused_flag else w1,
+            w1_scale=layer.fused_w1_scale if fused_flag else w1_scale,
+            w2=w2[0] if fused_flag else w2,
+            w2_scale=layer.fused_w2_scale if fused_flag else w2_scale,
             topk_weights=topk_weights,
             topk_ids=topk_ids,
             use_int8_w8a8=True,
@@ -270,6 +273,12 @@ class AscendW8A8DynamicFusedMoEMethod:
             layer.w2_weight_scale.data.shape[0], -1)
         layer.w2_weight_offset.data = layer.w2_weight_offset.data.view(
             layer.w2_weight_offset.data.shape[0], -1)
+
+        layer.fused_w1_scale = scale_from_float_to_int64(
+            layer.w13_weight_scale.data)
+        layer.fused_w2_scale = scale_from_float_to_int64(
+            layer.w2_weight_scale.data)
+
         if self.dynamic_eplb:
             layer.w13_weight_list = [
                 weight.clone()
@@ -292,3 +301,11 @@ class AscendW8A8DynamicFusedMoEMethod:
         del layer.w13_weight_scale_fp32
         del layer.w2_weight_scale
         torch.npu.empty_cache()
+
+
+def scale_from_float_to_int64(scale):
+    import numpy as np
+    scale = torch.from_numpy(
+        np.frombuffer(scale.cpu().to(torch.float32).numpy().tobytes(),
+                      dtype=np.int32).astype(np.int64)).to(scale.device)
+    return scale