[NVIDIA] FA3/FA4 Fix (#11606)

Co-authored-by: Baizhou Zhang <sobereddiezhang@gmail.com>
2025-10-20 02:10:10 +02:00
parent cbb5fc2edc
commit 252dc4e112
10 changed files with 382 additions and 219 deletions
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -1071,6 +1071,16 @@ class ServerArgs:
            self.enable_mixed_chunk = False
            self.disable_radix_cache = True

+        if self.attention_backend == "fa4" or self.decode_attention_backend == "fa4":
+            raise ValueError(
+                "FA4 backend is only supported for prefill. Please use `--prefill-attention-backend fa4` instead."
+            )
+        if self.prefill_attention_backend == "fa4":
+            logger.warning(
+                f"FA4 backend only supports page size 128, changing page_size from {self.page_size} to 128."
+            )
+            self.page_size = 128
+
    def _handle_page_size(self):
        if self.page_size is None:
            self.page_size = 1
--- a/python/sglang/test/test_utils.py
+++ b/python/sglang/test/test_utils.py
@@ -129,6 +129,11 @@ def is_in_amd_ci():
    return get_bool_env_var("SGLANG_IS_IN_CI_AMD")


+def is_blackwell_system():
+    """Return whether the IS_BLACKWELL env var flags a Blackwell (B200) system."""
+    return get_bool_env_var("IS_BLACKWELL")
+
+
 def _use_cached_default_models(model_repo: str):
    cache_dir = os.getenv("DEFAULT_MODEL_CACHE_DIR")
    if cache_dir and model_repo:
@@ -151,6 +156,9 @@ DEFAULT_URL_FOR_TEST = f"http://127.0.0.1:{DEFAULT_PORT_FOR_SRT_TEST_RUNNER + 10
 if is_in_amd_ci():
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 3000

+if is_blackwell_system():
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 3000  # same value as the AMD CI override above
+

 def call_generate_lightllm(prompt, temperature, max_tokens, stop=None, url=None):
    assert url is not None