[Auto Sync] Update modelopt_quant.py (20250920) (#10688)
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
@@ -77,6 +77,9 @@ logger = logging.getLogger(__name__)
 CUTEDSL_MOE_SCALAR_INPUT_SCALE = get_bool_env_var(
     "SGLANG_CUTEDSL_MOE_SCALAR_INPUT_SCALE", "true"
 )
+USE_CUTLASS_BACKEND_FOR_FP4_GEMM = get_bool_env_var(
+    "SGLANG_USE_CUTLASS_BACKEND_FOR_FP4_GEMM"
+)
 
 # Supported activation schemes for the current configuration
 ACTIVATION_SCHEMES = ["static"]
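For context, the new flag mirrors the existing CUTEDSL_MOE_SCALAR_INPUT_SCALE flag: it is resolved once at import time from the SGLANG_USE_CUTLASS_BACKEND_FOR_FP4_GEMM environment variable. The sketch below shows one way a boolean env-var helper like this can behave; it is a stand-in, not SGLang's actual get_bool_env_var, and the set of accepted truthy spellings is an assumption.

# Minimal sketch (stand-in, not SGLang's helper) of resolving a boolean
# env-var flag such as USE_CUTLASS_BACKEND_FOR_FP4_GEMM at import time.
import os

def get_bool_env_var(name: str, default: str = "false") -> bool:
    # Interpret common truthy spellings; anything else counts as False.
    return os.getenv(name, default).strip().lower() in ("1", "true", "yes", "on")

USE_CUTLASS_BACKEND_FOR_FP4_GEMM = get_bool_env_var(
    "SGLANG_USE_CUTLASS_BACKEND_FOR_FP4_GEMM"
)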
@@ -844,14 +847,25 @@ class ModelOptFp4LinearMethod(LinearMethodBase):
         if enable_flashinfer_fp4_gemm:
             w = layer.weight.T
             w_scale_interleaved = layer.weight_scale_interleaved.T
-            out = fp4_gemm(
-                x_fp4,
-                w,
-                x_scale_interleaved,
-                w_scale_interleaved,
-                layer.alpha,
-                output_dtype,
-            )
+            if USE_CUTLASS_BACKEND_FOR_FP4_GEMM:
+                out = fp4_gemm(
+                    x_fp4,
+                    w,
+                    x_scale_interleaved,
+                    w_scale_interleaved,
+                    layer.alpha,
+                    output_dtype,
+                    backend="cutlass",
+                )
+            else:
+                out = fp4_gemm(
+                    x_fp4,
+                    w,
+                    x_scale_interleaved,
+                    w_scale_interleaved,
+                    layer.alpha,
+                    output_dtype,
+                )
             if bias is not None:
                 out = out + bias
             return out.view(*output_shape)
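The second hunk keeps the fp4_gemm arguments identical on both paths and only adds backend="cutlass" when the flag is set. Below is a condensed sketch of that dispatch, with fp4_gemm treated as an opaque callable whose signature is taken from the call sites in the diff; the helper name run_fp4_gemm is illustrative, not part of the module.

# Condensed sketch of the dispatch added above; fp4_gemm stands in for the
# kernel the module imports, with its signature assumed from this diff.
def run_fp4_gemm(fp4_gemm, x_fp4, w, x_scale_interleaved,
                 w_scale_interleaved, alpha, output_dtype,
                 use_cutlass_backend: bool):
    if use_cutlass_backend:
        # CUTLASS path: same arguments plus an explicit backend selector.
        return fp4_gemm(x_fp4, w, x_scale_interleaved, w_scale_interleaved,
                        alpha, output_dtype, backend="cutlass")
    # Default path: unchanged call, backend left to the library default.
    return fp4_gemm(x_fp4, w, x_scale_interleaved, w_scale_interleaved,
                    alpha, output_dtype)

In practice the backend would be selected by exporting SGLANG_USE_CUTLASS_BACKEND_FOR_FP4_GEMM=true before the process starts, since the flag is read once at module import.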