Combine fp4.py and mxfp4.py into one file and support dynamic mxfp4 quantization in mxfp4.py (#9049)

Co-authored-by: wunhuang <wunhuang@amd.com>
This commit is contained in:
kk
2025-08-17 10:01:54 +08:00
committed by GitHub
parent 384f8ab5ce
commit 1c1f8a118e
7 changed files with 760 additions and 557 deletions

View File

@@ -33,6 +33,7 @@ from sglang.srt.utils import (
     configure_ipv6,
     get_device,
     get_device_memory_capacity,
+    is_cuda,
     is_flashinfer_available,
     is_hip,
     is_port_available,
@@ -2165,9 +2166,9 @@ class ServerArgs:
         model_arch = hf_config.architectures[0]
         if model_arch in ["GptOssForCausalLM"]:
             if self.attention_backend is None:
-                if is_sm100_supported():
+                if is_cuda() and is_sm100_supported():
                     self.attention_backend = "trtllm_mha"
-                elif is_sm90_supported():
+                elif is_cuda() and is_sm90_supported():
                     self.attention_backend = "fa3"
                 else:
                     self.attention_backend = "triton"