Combine fp4.py and mxfp4.py into one file and support dynamic mxfp4 quantization in mxfp4.py (#9049)

Co-authored-by: wunhuang <wunhuang@amd.com>
This commit is contained in:
kk
2025-08-17 10:01:54 +08:00
committed by GitHub
parent 384f8ab5ce
commit 1c1f8a118e
7 changed files with 760 additions and 557 deletions

View File

@@ -33,6 +33,7 @@ from sglang.srt.utils import (
     configure_ipv6,
     get_device,
     get_device_memory_capacity,
+    is_cuda,
     is_flashinfer_available,
     is_hip,
     is_port_available,
@@ -2165,9 +2166,9 @@ class ServerArgs:
         model_arch = hf_config.architectures[0]
         if model_arch in ["GptOssForCausalLM"]:
             if self.attention_backend is None:
-                if is_sm100_supported():
+                if is_cuda() and is_sm100_supported():
                     self.attention_backend = "trtllm_mha"
-                elif is_sm90_supported():
+                elif is_cuda() and is_sm90_supported():
                     self.attention_backend = "fa3"
                 else:
                     self.attention_backend = "triton"