Reduce overhead for FlashAttention (FA) by not calling the heavy CUDA property check (#7375)

2025-08-20 16:26:28 +08:00
parent de2dd73831
commit c9bf3877a0
2 changed files with 7 additions and 5 deletions
--- a/sgl-kernel/tests/test_flash_attention.py
+++ b/sgl-kernel/tests/test_flash_attention.py
@@ -25,10 +25,10 @@ def is_fa3_supported(device=None) -> bool:
    #  https://docs.nvidia.com/cuda/cuda-c-programming-guide/#shared-memory-8-x
    #  And for sgl-kernel right now, we can build fa3 on sm80/sm86/sm89/sm90a.
    #  That means if you use A100/A*0/L20/L40/L40s/4090 you can use fa3.
-    return (
+    return (torch.version.cuda >= "12.3") and (
        torch.cuda.get_device_capability(device)[0] == 9
        or torch.cuda.get_device_capability(device)[0] == 8
-    ) and (torch.version.cuda >= "12.3")
+    )


 DISABLE_BACKWARD = True