Combine fp4.py and mxfp4.py into one file and support dynamic mxfp4 quantization in mxfp4.py (#9049)
Co-authored-by: wunhuang <wunhuang@amd.com>
This commit is contained in:
@@ -33,6 +33,7 @@ from sglang.srt.utils import (
     configure_ipv6,
     get_device,
     get_device_memory_capacity,
+    is_cuda,
     is_flashinfer_available,
     is_hip,
     is_port_available,
@@ -2165,9 +2166,9 @@ class ServerArgs:
         model_arch = hf_config.architectures[0]
         if model_arch in ["GptOssForCausalLM"]:
             if self.attention_backend is None:
-                if is_sm100_supported():
+                if is_cuda() and is_sm100_supported():
                     self.attention_backend = "trtllm_mha"
-                elif is_sm90_supported():
+                elif is_cuda() and is_sm90_supported():
                     self.attention_backend = "fa3"
                 else:
                     self.attention_backend = "triton"
Reference in New Issue
Block a user