[2/n] decouple quantization implementation from vLLM dependency (#8112)

Co-authored-by: walker-ai <yiyun.wyt@antgroup.com>
Co-authored-by: leoneo <1320612015@qq.com>
commit 5aa1ebd242
parent 4dbf43601d
Author: Peng Zhang
Committer: GitHub
Date: 2025-08-14 18:19:03 +08:00
32 changed files with 6506 additions and 202 deletions

@@ -2,6 +2,7 @@ import functools
 from typing import Optional
 
 import torch
+from sgl_kernel import silu_and_mul
 
 
 def get_scalar_type(num_bits: int, has_zp: bool):
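
The added import above replaces the vLLM custom-op namespace (torch.ops._C) with sgl_kernel's native silu_and_mul, which is the decoupling this PR series is about. A minimal sketch of how such an optional kernel dependency is commonly guarded; the guard and error message are illustrative, not taken from this commit:

try:
    from sgl_kernel import silu_and_mul  # fused CUDA kernel from sgl-kernel
except ImportError as e:
    # Hypothetical guard: fail loudly once the vLLM fallback is gone.
    raise ImportError("sgl-kernel is required for fused_marlin_moe") from e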
@@ -165,7 +166,7 @@ def fused_marlin_moe(
         is_zp_float=False,
     )
-    torch.ops._C.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N))
+    silu_and_mul(intermediate_cache1.view(-1, 2 * N), intermediate_cache2)
 
     if expert_map is not None:
         intermediate_cache3.zero_()
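
Both kernels compute the same fused activation; only the calling convention differs, which is why the two tensor arguments swap places in the hunk above: vLLM's torch.ops._C.silu_and_mul takes (out, input), while sgl_kernel's silu_and_mul takes (input, out). A pure-PyTorch sketch of the math, with shapes matching the caches in this hunk (silu_and_mul_ref and the sample sizes are illustrative, not part of the commit):

import torch

def silu_and_mul_ref(x: torch.Tensor) -> torch.Tensor:
    # x packs [gate, up] halves along the last dimension (size 2 * N);
    # the result is silu(gate) * up with last dimension N.
    gate, up = x.chunk(2, dim=-1)
    return torch.nn.functional.silu(gate) * up

x = torch.randn(4, 2 * 8)  # stand-in for intermediate_cache1.view(-1, 2 * N)
assert silu_and_mul_ref(x).shape == (4, 8)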