[2/n] Decouple quantization implementation from vLLM dependency (#8112)
Co-authored-by: walker-ai <yiyun.wyt@antgroup.com>
Co-authored-by: leoneo <1320612015@qq.com>
This commit is contained in:
@@ -2,6 +2,7 @@ import functools
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
from sgl_kernel import silu_and_mul
|
||||
|
||||
|
||||
def get_scalar_type(num_bits: int, has_zp: bool):
|
||||
@@ -165,7 +166,7 @@ def fused_marlin_moe(
|
||||
is_zp_float=False,
|
||||
)
|
||||
|
||||
torch.ops._C.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N))
|
||||
silu_and_mul(intermediate_cache1.view(-1, 2 * N), intermediate_cache2)
|
||||
|
||||
if expert_map is not None:
|
||||
intermediate_cache3.zero_()
|
||||
|
||||
Reference in New Issue
Block a user