ROCm: update aiter and its usage to fused moe (bfloat16, fp8, fp8 block-quant) (#4053)

This commit is contained in:
HAI
2025-03-04 03:00:46 -08:00
committed by GitHub
parent e0a2c96308
commit 51d25405a7
4 changed files with 82 additions and 40 deletions

View File

@@ -29,6 +29,9 @@ import logging
is_hip_ = is_hip()
if is_hip_:
from aiter import ck_moe
logger = logging.getLogger(__name__)
@@ -173,18 +176,20 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
)
if is_hip_ and get_bool_env_var("CK_MOE"):
import aiter
from aiter.fused_moe import fused_experts_ck
assert activation == "silu", f"{activation=} is not supported."
assert not no_combine, "unsupported"
return fused_experts_ck(
hidden_states=x,
w1=layer.w13_weight,
w2=layer.w2_weight,
topk_weights=topk_weights,
topk_ids=topk_ids,
return ck_moe(
x,
layer.w13_weight,
layer.w2_weight,
topk_weights,
topk_ids,
None,
None,
None,
None,
32,
None,
activation,
)
else:
return fused_experts(