set default attention backend for deterministic inference (#11801)
```diff
@@ -44,6 +44,7 @@ from sglang.srt.utils import (
     is_remote_url,
     is_sm90_supported,
     is_sm100_supported,
+    is_sm120_supported,
     is_triton_kernels_available,
     is_valid_ipv6_address,
     json_list_type,
```
```diff
@@ -1411,9 +1412,23 @@ class ServerArgs:
         )

         # Check attention backend
-        if self.attention_backend not in DETERMINISTIC_ATTENTION_BACKEND_CHOICES:
+        if self.attention_backend is None:
+            # User didn't specify attention backend, fallback based on GPU architecture
+            if is_sm100_supported() or is_sm120_supported():
+                # Blackwell and newer architectures
+                self.attention_backend = "flashinfer"
+            else:
+                # Hopper (SM90) and older architectures
+                self.attention_backend = "fa3"
+            logger.warning(
+                f"Attention backend not specified. Falling back to '{self.attention_backend}' for deterministic inference. "
+                f"You can explicitly set --attention-backend to one of {DETERMINISTIC_ATTENTION_BACKEND_CHOICES}."
+            )
+        elif self.attention_backend not in DETERMINISTIC_ATTENTION_BACKEND_CHOICES:
+            # User explicitly specified an incompatible attention backend
             raise ValueError(
-                f"Currently only {DETERMINISTIC_ATTENTION_BACKEND_CHOICES} attention backends are supported for deterministic inference."
+                f"Currently only {DETERMINISTIC_ATTENTION_BACKEND_CHOICES} attention backends are supported for deterministic inference, "
+                f"but you explicitly specified '{self.attention_backend}'."
             )

         # Currently, only FA3 supports radix cache. Support for other backends is in progress
```
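Boiled down, the new fallback is a pure decision on GPU architecture; an explicitly passed backend is still validated against `DETERMINISTIC_ATTENTION_BACKEND_CHOICES`, and only the unset case gets the architecture-based default plus a warning. A minimal sketch of the policy in isolation (the function name `default_deterministic_backend` is hypothetical, not part of the patch):

```python
# Hypothetical standalone restatement of the fallback policy above;
# in the patch, the two flags come from is_sm100_supported()/is_sm120_supported().
def default_deterministic_backend(sm100: bool, sm120: bool) -> str:
    # Blackwell (SM100) and newer (SM120) default to flashinfer;
    # Hopper (SM90) and older fall back to fa3.
    return "flashinfer" if (sm100 or sm120) else "fa3"


assert default_deterministic_backend(sm100=True, sm120=False) == "flashinfer"
assert default_deterministic_backend(sm100=False, sm120=False) == "fa3"
```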
```diff
@@ -174,6 +174,15 @@ def is_blackwell():
     return torch.cuda.get_device_capability()[0] == 10


+@lru_cache(maxsize=1)
+def is_sm120_supported(device=None) -> bool:
+    if not is_cuda_alike():
+        return False
+    return (torch.cuda.get_device_capability(device)[0] == 12) and (
+        torch.version.cuda >= "12.8"
+    )
+
+
 @lru_cache(maxsize=1)
 def is_sm100_supported(device=None) -> bool:
     if not is_cuda_alike():
```