[NVIDIA] FA3/FA4 Fix (#11606)

Co-authored-by: Baizhou Zhang <sobereddiezhang@gmail.com>
This commit is contained in:
Johnny
2025-10-20 02:10:10 +02:00
committed by GitHub
parent cbb5fc2edc
commit 252dc4e112
10 changed files with 382 additions and 219 deletions

View File

@@ -1071,6 +1071,16 @@ class ServerArgs:
self.enable_mixed_chunk = False
self.disable_radix_cache = True
if self.attention_backend == "fa4" or self.decode_attention_backend == "fa4":
raise ValueError(
"FA4 backend is only supported for prefill. Please use `--prefill-attention-backend fa4` instead."
)
if self.prefill_attention_backend == "fa4":
logger.warning(
f"FA4 backend only supports page size 128, changing page_size from {self.page_size} to 128."
)
self.page_size = 128
def _handle_page_size(self):
if self.page_size is None:
self.page_size = 1

View File

@@ -129,6 +129,11 @@ def is_in_amd_ci():
return get_bool_env_var("SGLANG_IS_IN_CI_AMD")
def is_blackwell_system():
    """Tell whether the environment marks this host as a Blackwell (B200) system.

    The answer comes solely from the ``IS_BLACKWELL`` environment flag, read
    through ``get_bool_env_var`` on every call; no hardware probing happens
    here.
    """
    blackwell_flag = get_bool_env_var("IS_BLACKWELL")
    return blackwell_flag
def _use_cached_default_models(model_repo: str):
cache_dir = os.getenv("DEFAULT_MODEL_CACHE_DIR")
if cache_dir and model_repo:
@@ -151,6 +156,9 @@ DEFAULT_URL_FOR_TEST = f"http://127.0.0.1:{DEFAULT_PORT_FOR_SRT_TEST_RUNNER + 10
# Import-time overrides of DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH for specific test environments.
# Presumably 3000 is larger than the default assigned earlier in the file; that default is
# not visible in this hunk - TODO confirm.
# AMD CI: is_in_amd_ci() reads the SGLANG_IS_IN_CI_AMD environment flag.
if is_in_amd_ci():
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 3000
# Blackwell hosts: is_blackwell_system() reads the IS_BLACKWELL environment flag and
# gets the same value as the AMD CI case.
if is_blackwell_system():
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 3000
def call_generate_lightllm(prompt, temperature, max_tokens, stop=None, url=None):
assert url is not None